1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 // MARKER(update_precomp.py): autogen include statement, do not remove 29 #include "precompiled_ucb.hxx" 30 #include <regexp.hxx> 31 32 #include <cstddef> 33 34 #include "osl/diagnose.h" 35 #include <com/sun/star/lang/IllegalArgumentException.hpp> 36 #include <rtl/ustrbuf.hxx> 37 #include <rtl/ustring.hxx> 38 39 namespace unnamed_ucb_regexp {} using namespace unnamed_ucb_regexp; 40 // unnamed namespaces don't work well yet... 41 42 using namespace com::sun::star; 43 using namespace ucb_impl; 44 45 //============================================================================ 46 // 47 // Regexp 48 // 49 //============================================================================ 50 51 inline Regexp::Regexp(Kind eTheKind, rtl::OUString const & rThePrefix, 52 bool bTheEmptyDomain, rtl::OUString const & rTheInfix, 53 bool bTheTranslation, 54 rtl::OUString const & rTheReversePrefix): 55 m_eKind(eTheKind), 56 m_aPrefix(rThePrefix), 57 m_aInfix(rTheInfix), 58 m_aReversePrefix(rTheReversePrefix), 59 m_bEmptyDomain(bTheEmptyDomain), 60 m_bTranslation(bTheTranslation) 61 { 62 OSL_ASSERT(m_eKind == KIND_DOMAIN 63 || !m_bEmptyDomain && m_aInfix.getLength() == 0); 64 OSL_ASSERT(m_bTranslation || m_aReversePrefix.getLength() == 0); 65 } 66 67 //============================================================================ 68 namespace unnamed_ucb_regexp { 69 70 bool matchStringIgnoreCase(sal_Unicode const ** pBegin, 71 sal_Unicode const * pEnd, 72 rtl::OUString const & rString) 73 { 74 sal_Unicode const * p = *pBegin; 75 76 sal_Unicode const * q = rString.getStr(); 77 sal_Unicode const * qEnd = q + rString.getLength(); 78 79 if (pEnd - p < qEnd - q) 80 return false; 81 82 while (q != qEnd) 83 { 84 sal_Unicode c1 = *p++; 85 sal_Unicode c2 = *q++; 86 if (c1 >= 'a' && c1 <= 'z') 87 c1 -= 'a' - 'A'; 88 if (c2 >= 'a' && c2 <= 'z') 89 c2 -= 'a' - 'A'; 90 if (c1 != c2) 91 return false; 92 } 93 94 *pBegin = p; 95 return true; 96 } 97 98 } 99 100 bool Regexp::matches(rtl::OUString const & rString, 101 rtl::OUString * pTranslation, bool * pTranslated) const 102 { 103 sal_Unicode const * pBegin = rString.getStr(); 104 sal_Unicode const * pEnd = pBegin + rString.getLength(); 105 106 bool bMatches = false; 107 108 sal_Unicode const * p = pBegin; 109 if (matchStringIgnoreCase(&p, pEnd, m_aPrefix)) 110 { 111 sal_Unicode const * pBlock1Begin = p; 112 sal_Unicode const * pBlock1End = pEnd; 113 114 sal_Unicode const * pBlock2Begin = 0; 115 sal_Unicode const * pBlock2End = 0; 116 117 switch (m_eKind) 118 { 119 case KIND_PREFIX: 120 bMatches = true; 121 break; 122 123 case KIND_AUTHORITY: 124 bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#'; 125 break; 126 127 case KIND_DOMAIN: 128 if (!m_bEmptyDomain) 129 { 130 if (p == pEnd || *p == '/' || *p == '?' || *p == '#') 131 break; 132 ++p; 133 } 134 for (;;) 135 { 136 sal_Unicode const * q = p; 137 if (matchStringIgnoreCase(&q, pEnd, m_aInfix) 138 && (q == pEnd || *q == '/' || *q == '?' || *q == '#')) 139 { 140 bMatches = true; 141 pBlock1End = p; 142 pBlock2Begin = q; 143 pBlock2End = pEnd; 144 break; 145 } 146 147 if (p == pEnd) 148 break; 149 150 sal_Unicode c = *p++; 151 if (c == '/' || c == '?' || c == '#') 152 break; 153 } 154 break; 155 } 156 157 if (bMatches) 158 { 159 if (m_bTranslation) 160 { 161 if (pTranslation) 162 { 163 rtl::OUStringBuffer aBuffer(m_aReversePrefix); 164 aBuffer.append(pBlock1Begin, pBlock1End - pBlock1Begin); 165 aBuffer.append(m_aInfix); 166 aBuffer.append(pBlock2Begin, pBlock2End - pBlock2Begin); 167 *pTranslation = aBuffer.makeStringAndClear(); 168 } 169 if (pTranslated) 170 *pTranslated = true; 171 } 172 else 173 { 174 if (pTranslation) 175 *pTranslation = rString; 176 if (pTranslated) 177 *pTranslated = false; 178 } 179 } 180 } 181 182 return bMatches; 183 } 184 185 //============================================================================ 186 namespace unnamed_ucb_regexp { 187 188 inline bool isAlpha(sal_Unicode c) 189 { 190 return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); 191 } 192 193 inline bool isDigit(sal_Unicode c) 194 { 195 return c >= '0' && c <= '9'; 196 } 197 198 bool isScheme(rtl::OUString const & rString, bool bColon) 199 { 200 // Return true if rString matches <scheme> (plus a trailing ":" if bColon 201 // is true) from RFC 2396: 202 sal_Unicode const * p = rString.getStr(); 203 sal_Unicode const * pEnd = p + rString.getLength(); 204 if (p != pEnd && isAlpha(*p)) 205 for (++p;;) 206 { 207 if (p == pEnd) 208 return !bColon; 209 sal_Unicode c = *p++; 210 if (!(isAlpha(c) || isDigit(c) 211 || c == '+' || c == '-' || c == '.')) 212 return bColon && c == ':' && p == pEnd; 213 } 214 return false; 215 } 216 217 void appendStringLiteral(rtl::OUStringBuffer * pBuffer, 218 rtl::OUString const & rString) 219 { 220 OSL_ASSERT(pBuffer); 221 222 pBuffer->append(sal_Unicode('"')); 223 sal_Unicode const * p = rString.getStr(); 224 sal_Unicode const * pEnd = p + rString.getLength(); 225 while (p != pEnd) 226 { 227 sal_Unicode c = *p++; 228 if (c == '"' || c == '\\') 229 pBuffer->append(sal_Unicode('\\')); 230 pBuffer->append(c); 231 } 232 pBuffer->append(sal_Unicode('"')); 233 } 234 235 } 236 237 rtl::OUString Regexp::getRegexp(bool bReverse) const 238 { 239 if (m_bTranslation) 240 { 241 rtl::OUStringBuffer aBuffer; 242 if (bReverse) 243 { 244 if (m_aReversePrefix.getLength() != 0) 245 appendStringLiteral(&aBuffer, m_aReversePrefix); 246 } 247 else 248 { 249 if (m_aPrefix.getLength() != 0) 250 appendStringLiteral(&aBuffer, m_aPrefix); 251 } 252 switch (m_eKind) 253 { 254 case KIND_PREFIX: 255 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("(.*)")); 256 break; 257 258 case KIND_AUTHORITY: 259 aBuffer. 260 appendAscii(RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)")); 261 break; 262 263 case KIND_DOMAIN: 264 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([^/?#]")); 265 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+')); 266 if (m_aInfix.getLength() != 0) 267 appendStringLiteral(&aBuffer, m_aInfix); 268 aBuffer. 269 appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?)")); 270 break; 271 } 272 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("->")); 273 if (bReverse) 274 { 275 if (m_aPrefix.getLength() != 0) 276 appendStringLiteral(&aBuffer, m_aPrefix); 277 } 278 else 279 { 280 if (m_aReversePrefix.getLength() != 0) 281 appendStringLiteral(&aBuffer, m_aReversePrefix); 282 } 283 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("\\1")); 284 return aBuffer.makeStringAndClear(); 285 } 286 else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true)) 287 return m_aPrefix.copy(0, m_aPrefix.getLength() - 1); 288 else 289 { 290 rtl::OUStringBuffer aBuffer; 291 if (m_aPrefix.getLength() != 0) 292 appendStringLiteral(&aBuffer, m_aPrefix); 293 switch (m_eKind) 294 { 295 case KIND_PREFIX: 296 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM(".*")); 297 break; 298 299 case KIND_AUTHORITY: 300 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")); 301 break; 302 303 case KIND_DOMAIN: 304 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("[^/?#]")); 305 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+')); 306 if (m_aInfix.getLength() != 0) 307 appendStringLiteral(&aBuffer, m_aInfix); 308 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")); 309 break; 310 } 311 return aBuffer.makeStringAndClear(); 312 } 313 } 314 315 //============================================================================ 316 namespace unnamed_ucb_regexp { 317 318 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd, 319 sal_Char const * pString, size_t nStringLength) 320 { 321 sal_Unicode const * p = *pBegin; 322 323 sal_uChar const * q = reinterpret_cast< sal_uChar const * >(pString); 324 sal_uChar const * qEnd = q + nStringLength; 325 326 if (pEnd - p < qEnd - q) 327 return false; 328 329 while (q != qEnd) 330 { 331 sal_Unicode c1 = *p++; 332 sal_Unicode c2 = *q++; 333 if (c1 != c2) 334 return false; 335 } 336 337 *pBegin = p; 338 return true; 339 } 340 341 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd, 342 rtl::OUString * pString) 343 { 344 sal_Unicode const * p = *pBegin; 345 346 if (p == pEnd || *p++ != '"') 347 return false; 348 349 rtl::OUStringBuffer aBuffer; 350 for (;;) 351 { 352 if (p == pEnd) 353 return false; 354 sal_Unicode c = *p++; 355 if (c == '"') 356 break; 357 if (c == '\\') 358 { 359 if (p == pEnd) 360 return false; 361 c = *p++; 362 if (c != '"' && c != '\\') 363 return false; 364 } 365 aBuffer.append(c); 366 } 367 368 *pBegin = p; 369 *pString = aBuffer.makeStringAndClear(); 370 return true; 371 } 372 373 } 374 375 Regexp Regexp::parse(rtl::OUString const & rRegexp) 376 { 377 // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*' 378 // where <scheme> is as defined in RFC 2396: 379 if (isScheme(rRegexp, false)) 380 return Regexp(Regexp::KIND_PREFIX, 381 rRegexp 382 + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM(":")), 383 false, 384 rtl::OUString(), 385 false, 386 rtl::OUString()); 387 388 sal_Unicode const * p = rRegexp.getStr(); 389 sal_Unicode const * pEnd = p + rRegexp.getLength(); 390 391 rtl::OUString aPrefix; 392 scanStringLiteral(&p, pEnd, &aPrefix); 393 394 if (p == pEnd) 395 throw lang::IllegalArgumentException(); 396 397 if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*"))) 398 { 399 if (p != pEnd) 400 throw lang::IllegalArgumentException(); 401 402 return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(), 403 false, rtl::OUString()); 404 } 405 else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->"))) 406 { 407 rtl::OUString aReversePrefix; 408 scanStringLiteral(&p, pEnd, &aReversePrefix); 409 410 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1")) 411 || p != pEnd) 412 throw lang::IllegalArgumentException(); 413 414 return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(), 415 true, aReversePrefix); 416 } 417 else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"))) 418 { 419 if (p != pEnd) 420 throw lang::IllegalArgumentException(); 421 422 return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(), 423 false, rtl::OUString()); 424 } 425 else if (matchString(&p, pEnd, 426 RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->"))) 427 { 428 rtl::OUString aReversePrefix; 429 if (!(scanStringLiteral(&p, pEnd, &aReversePrefix) 430 && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1")) 431 && p == pEnd)) 432 throw lang::IllegalArgumentException(); 433 434 return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(), 435 true, aReversePrefix); 436 } 437 else 438 { 439 bool bOpen = false; 440 if (p != pEnd && *p == '(') 441 { 442 ++p; 443 bOpen = true; 444 } 445 446 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]"))) 447 throw lang::IllegalArgumentException(); 448 449 if (p == pEnd || (*p != '*' && *p != '+')) 450 throw lang::IllegalArgumentException(); 451 bool bEmptyDomain = *p++ == '*'; 452 453 rtl::OUString aInfix; 454 scanStringLiteral(&p, pEnd, &aInfix); 455 456 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"))) 457 throw lang::IllegalArgumentException(); 458 459 rtl::OUString aReversePrefix; 460 if (bOpen 461 && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->")) 462 && scanStringLiteral(&p, pEnd, &aReversePrefix) 463 && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1")))) 464 throw lang::IllegalArgumentException(); 465 466 if (p != pEnd) 467 throw lang::IllegalArgumentException(); 468 469 return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix, 470 bOpen, aReversePrefix); 471 } 472 } 473 474