1*cdf0e10cSrcweir /************************************************************************* 2*cdf0e10cSrcweir * 3*cdf0e10cSrcweir * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4*cdf0e10cSrcweir * 5*cdf0e10cSrcweir * Copyright 2000, 2010 Oracle and/or its affiliates. 6*cdf0e10cSrcweir * 7*cdf0e10cSrcweir * OpenOffice.org - a multi-platform office productivity suite 8*cdf0e10cSrcweir * 9*cdf0e10cSrcweir * This file is part of OpenOffice.org. 10*cdf0e10cSrcweir * 11*cdf0e10cSrcweir * OpenOffice.org is free software: you can redistribute it and/or modify 12*cdf0e10cSrcweir * it under the terms of the GNU Lesser General Public License version 3 13*cdf0e10cSrcweir * only, as published by the Free Software Foundation. 14*cdf0e10cSrcweir * 15*cdf0e10cSrcweir * OpenOffice.org is distributed in the hope that it will be useful, 16*cdf0e10cSrcweir * but WITHOUT ANY WARRANTY; without even the implied warranty of 17*cdf0e10cSrcweir * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18*cdf0e10cSrcweir * GNU Lesser General Public License version 3 for more details 19*cdf0e10cSrcweir * (a copy is included in the LICENSE file that accompanied this code). 20*cdf0e10cSrcweir * 21*cdf0e10cSrcweir * You should have received a copy of the GNU Lesser General Public License 22*cdf0e10cSrcweir * version 3 along with OpenOffice.org. If not, see 23*cdf0e10cSrcweir * <http://www.openoffice.org/license.html> 24*cdf0e10cSrcweir * for a copy of the LGPLv3 License. 25*cdf0e10cSrcweir * 26*cdf0e10cSrcweir ************************************************************************/ 27*cdf0e10cSrcweir 28*cdf0e10cSrcweir #ifndef _RTL_URI_H_ 29*cdf0e10cSrcweir #define _RTL_URI_H_ 30*cdf0e10cSrcweir 31*cdf0e10cSrcweir #include "rtl/textenc.h" 32*cdf0e10cSrcweir #include "rtl/ustring.h" 33*cdf0e10cSrcweir #include "sal/types.h" 34*cdf0e10cSrcweir 35*cdf0e10cSrcweir #if defined __cplusplus 36*cdf0e10cSrcweir extern "C" { 37*cdf0e10cSrcweir #endif /* __cplusplus */ 38*cdf0e10cSrcweir 39*cdf0e10cSrcweir /** Various predefined URI 'char classes.' 40*cdf0e10cSrcweir 41*cdf0e10cSrcweir @descr 42*cdf0e10cSrcweir A 'char class' defines which (ASCII) characters can be written 'as they 43*cdf0e10cSrcweir are' in a part of a Uri, and which characters have to be written using 44*cdf0e10cSrcweir escape sequences ('%' followed by two hex digits). Characters outside 45*cdf0e10cSrcweir the ASCII range are always written using escape sequences. 46*cdf0e10cSrcweir 47*cdf0e10cSrcweir @descr 48*cdf0e10cSrcweir If there are other frequently used char classes, they can be added to 49*cdf0e10cSrcweir this enumeration; the function rtl_getUriCharClass() has to be adapted 50*cdf0e10cSrcweir then, too. 51*cdf0e10cSrcweir */ 52*cdf0e10cSrcweir typedef enum 53*cdf0e10cSrcweir { 54*cdf0e10cSrcweir /** The empty char class. 55*cdf0e10cSrcweir 56*cdf0e10cSrcweir @descr 57*cdf0e10cSrcweir All characters are written using escape sequences. 58*cdf0e10cSrcweir */ 59*cdf0e10cSrcweir rtl_UriCharClassNone, 60*cdf0e10cSrcweir 61*cdf0e10cSrcweir /** The RFC 2732 <uric> char class. 62*cdf0e10cSrcweir 63*cdf0e10cSrcweir @descr 64*cdf0e10cSrcweir The 'valid' characters are !$&'()*+,-./:;=?@[]_~ plus digits and 65*cdf0e10cSrcweir letters. 66*cdf0e10cSrcweir */ 67*cdf0e10cSrcweir rtl_UriCharClassUric, 68*cdf0e10cSrcweir 69*cdf0e10cSrcweir /** The RFC 2396 <uric_no_slash> char class. 70*cdf0e10cSrcweir 71*cdf0e10cSrcweir @descr 72*cdf0e10cSrcweir The 'valid' characters are !$&'()*+,-.:;=?@_~ plus digits and letters. 73*cdf0e10cSrcweir */ 74*cdf0e10cSrcweir rtl_UriCharClassUricNoSlash, 75*cdf0e10cSrcweir 76*cdf0e10cSrcweir /** The RFC 2396 <rel_segment> char class. 77*cdf0e10cSrcweir 78*cdf0e10cSrcweir @descr 79*cdf0e10cSrcweir The 'valid' characters are !$&'()*+,-.;=@_~ plus digits and letters. 80*cdf0e10cSrcweir */ 81*cdf0e10cSrcweir rtl_UriCharClassRelSegment, 82*cdf0e10cSrcweir 83*cdf0e10cSrcweir /** The RFC 2396 <reg_name> char class. 84*cdf0e10cSrcweir 85*cdf0e10cSrcweir @descr 86*cdf0e10cSrcweir The 'valid' characters are !$&'()*+,-.:;=@_~ plus digits and letters. 87*cdf0e10cSrcweir */ 88*cdf0e10cSrcweir rtl_UriCharClassRegName, 89*cdf0e10cSrcweir 90*cdf0e10cSrcweir /** The RFC 2396 <userinfo> char class. 91*cdf0e10cSrcweir 92*cdf0e10cSrcweir @descr 93*cdf0e10cSrcweir The 'valid' characters are !$&'()*+,-.:;=_~ plus digits and letters. 94*cdf0e10cSrcweir */ 95*cdf0e10cSrcweir rtl_UriCharClassUserinfo, 96*cdf0e10cSrcweir 97*cdf0e10cSrcweir /** The RFC 2396 <pchar> char class. 98*cdf0e10cSrcweir 99*cdf0e10cSrcweir @descr 100*cdf0e10cSrcweir The 'valid' characters are !$&'()*+,-.:=@_~ plus digits and letters. 101*cdf0e10cSrcweir */ 102*cdf0e10cSrcweir rtl_UriCharClassPchar, 103*cdf0e10cSrcweir 104*cdf0e10cSrcweir /** The char class for the values of uno URL parameters. 105*cdf0e10cSrcweir 106*cdf0e10cSrcweir @descr 107*cdf0e10cSrcweir The 'valid' characters are !$&'()*+-./:?@_~ plus digits and letters. 108*cdf0e10cSrcweir */ 109*cdf0e10cSrcweir rtl_UriCharClassUnoParamValue, 110*cdf0e10cSrcweir 111*cdf0e10cSrcweir rtl_UriCharClass_FORCE_EQUAL_SIZE = SAL_MAX_ENUM 112*cdf0e10cSrcweir } 113*cdf0e10cSrcweir rtl_UriCharClass; 114*cdf0e10cSrcweir 115*cdf0e10cSrcweir /** The mechanism describing how escape sequences in the input of 116*cdf0e10cSrcweir rtl_uriEncode() are handled. 117*cdf0e10cSrcweir */ 118*cdf0e10cSrcweir typedef enum 119*cdf0e10cSrcweir { 120*cdf0e10cSrcweir /** The special meaning of '%' is ignored (i.e., there are by definition 121*cdf0e10cSrcweir no escape sequences in the input). 122*cdf0e10cSrcweir 123*cdf0e10cSrcweir @descr 124*cdf0e10cSrcweir This mechanism is useful to encode user input as part of a URI (e.g., 125*cdf0e10cSrcweir the user-supplied password in an ftp URL---'%20abcde' is a valid 126*cdf0e10cSrcweir password, so do not assume that the '%20' is an escaped space). 127*cdf0e10cSrcweir */ 128*cdf0e10cSrcweir rtl_UriEncodeIgnoreEscapes, 129*cdf0e10cSrcweir 130*cdf0e10cSrcweir /** All escape sequences ('%' followed by two hex digits) are kept intact, 131*cdf0e10cSrcweir even if they represent characters that need not be escaped or if they 132*cdf0e10cSrcweir do not even map to characters in the given charset. 133*cdf0e10cSrcweir 134*cdf0e10cSrcweir @descr 135*cdf0e10cSrcweir This mechanism is useful when passing on complete URIs more or less 136*cdf0e10cSrcweir unmodified (e.g., within an HTTP proxy): missing escape sequences are 137*cdf0e10cSrcweir added, but existing escape sequences are not touched (except that any 138*cdf0e10cSrcweir lower case hex digits are replaced by upper case hex digits). 139*cdf0e10cSrcweir */ 140*cdf0e10cSrcweir rtl_UriEncodeKeepEscapes, 141*cdf0e10cSrcweir 142*cdf0e10cSrcweir /** All escape sequences ('%' followed by two hex digits) are resolved in 143*cdf0e10cSrcweir a first step; only those that represent characters that need to be 144*cdf0e10cSrcweir escaped are kept intact. 145*cdf0e10cSrcweir 146*cdf0e10cSrcweir @descr 147*cdf0e10cSrcweir This mechanism is useful to properly encode complete URIs entered by 148*cdf0e10cSrcweir the user: the URI is brought into a 'canonic form,' but care is taken 149*cdf0e10cSrcweir not to damage (valid) escape sequences the (careful) user already 150*cdf0e10cSrcweir entered as such. 151*cdf0e10cSrcweir */ 152*cdf0e10cSrcweir rtl_UriEncodeCheckEscapes, 153*cdf0e10cSrcweir 154*cdf0e10cSrcweir /** Like rtl_UriEncodeIgnoreEscapes, but indicating failure when converting 155*cdf0e10cSrcweir unmappable characters. 156*cdf0e10cSrcweir 157*cdf0e10cSrcweir @since UDK 3.2.0 158*cdf0e10cSrcweir */ 159*cdf0e10cSrcweir rtl_UriEncodeStrict, 160*cdf0e10cSrcweir 161*cdf0e10cSrcweir /** Like rtl_UriEncodeKeepEscapes, but indicating failure when converting 162*cdf0e10cSrcweir unmappable characters. 163*cdf0e10cSrcweir 164*cdf0e10cSrcweir @since UDK 3.2.7 165*cdf0e10cSrcweir */ 166*cdf0e10cSrcweir rtl_UriEncodeStrictKeepEscapes, 167*cdf0e10cSrcweir 168*cdf0e10cSrcweir rtl_UriEncode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM 169*cdf0e10cSrcweir } 170*cdf0e10cSrcweir rtl_UriEncodeMechanism; 171*cdf0e10cSrcweir 172*cdf0e10cSrcweir /** The mechanism describing how rtl_uriDecode() translates (part of) a URI 173*cdf0e10cSrcweir into a Unicode string. 174*cdf0e10cSrcweir */ 175*cdf0e10cSrcweir typedef enum 176*cdf0e10cSrcweir { 177*cdf0e10cSrcweir /** The text is returned completely unmodified. 178*cdf0e10cSrcweir */ 179*cdf0e10cSrcweir rtl_UriDecodeNone, 180*cdf0e10cSrcweir 181*cdf0e10cSrcweir /** The text is returned in the form of an IURI (cf. 182*cdf0e10cSrcweir draft-masinter-url-i18n-05.txt). 183*cdf0e10cSrcweir 184*cdf0e10cSrcweir @descr 185*cdf0e10cSrcweir All escape sequences representing ASCII characters (%00--%7F) are 186*cdf0e10cSrcweir kept, all other escape sequences are interpreted as UTF-8 characters 187*cdf0e10cSrcweir and translated to Unicode, if possible. 188*cdf0e10cSrcweir */ 189*cdf0e10cSrcweir rtl_UriDecodeToIuri, 190*cdf0e10cSrcweir 191*cdf0e10cSrcweir /** The text is decoded. 192*cdf0e10cSrcweir 193*cdf0e10cSrcweir @descr 194*cdf0e10cSrcweir All escape sequences representing characters from the given charset 195*cdf0e10cSrcweir are decoded and translated to Unicode, if possible. 196*cdf0e10cSrcweir */ 197*cdf0e10cSrcweir rtl_UriDecodeWithCharset, 198*cdf0e10cSrcweir 199*cdf0e10cSrcweir /** Like rtl_UriDecodeWithCharset, but indicating failure when converting 200*cdf0e10cSrcweir unmappable characters. 201*cdf0e10cSrcweir 202*cdf0e10cSrcweir @since UDK 3.2.0 203*cdf0e10cSrcweir */ 204*cdf0e10cSrcweir rtl_UriDecodeStrict, 205*cdf0e10cSrcweir 206*cdf0e10cSrcweir rtl_UriDecode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM 207*cdf0e10cSrcweir } 208*cdf0e10cSrcweir rtl_UriDecodeMechanism; 209*cdf0e10cSrcweir 210*cdf0e10cSrcweir /** Map a predefined rtl_UriCharClass to a form usable by rtl_uriEncode(). 211*cdf0e10cSrcweir 212*cdf0e10cSrcweir @descr 213*cdf0e10cSrcweir The function rtl_uriEncode() expects an array of 128 booleans, and this 214*cdf0e10cSrcweir function maps rtl_UriCharClass enumeration members to such arrays. 215*cdf0e10cSrcweir 216*cdf0e10cSrcweir @param eCharClass 217*cdf0e10cSrcweir Any valid member of rtl_UriCharClass. 218*cdf0e10cSrcweir 219*cdf0e10cSrcweir @return 220*cdf0e10cSrcweir An array of 128 booleans, to be used in calls to rtl_uriEncode(). 221*cdf0e10cSrcweir */ 222*cdf0e10cSrcweir sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass) 223*cdf0e10cSrcweir SAL_THROW_EXTERN_C(); 224*cdf0e10cSrcweir 225*cdf0e10cSrcweir /** Encode a text as (part of) a URI. 226*cdf0e10cSrcweir 227*cdf0e10cSrcweir @param pText 228*cdf0e10cSrcweir Any Unicode string. Must not be null. 229*cdf0e10cSrcweir 230*cdf0e10cSrcweir @param pCharClass 231*cdf0e10cSrcweir A char class, represented as an array of 128 booleans (true means keep the 232*cdf0e10cSrcweir corresponding ASCII character unencoded, false means encode it). Must not 233*cdf0e10cSrcweir be null, and the boolean corresponding to the percent sign (0x25) must be 234*cdf0e10cSrcweir false. (See rtl_getUriCharClass() for a function mapping from 235*cdf0e10cSrcweir rtl_UriCharClass to such arrays.) 236*cdf0e10cSrcweir 237*cdf0e10cSrcweir @param eMechanism 238*cdf0e10cSrcweir The mechanism describing how escape sequences in the input text are 239*cdf0e10cSrcweir handled. 240*cdf0e10cSrcweir 241*cdf0e10cSrcweir @param eCharset 242*cdf0e10cSrcweir When Unicode characters from the input text have to be written using 243*cdf0e10cSrcweir escape sequences (because they are either outside the ASCII range or do 244*cdf0e10cSrcweir not belong to the given char class), they are first translated into this 245*cdf0e10cSrcweir charset before being encoded using escape sequences. 246*cdf0e10cSrcweir 247*cdf0e10cSrcweir Also, if the encode mechanism is rtl_UriEncodeCheckEscapes, all escape 248*cdf0e10cSrcweir sequences already present in the input text are interpreted as characters 249*cdf0e10cSrcweir from this charset. 250*cdf0e10cSrcweir 251*cdf0e10cSrcweir @param pResult 252*cdf0e10cSrcweir Returns an encoded representation of the input text. Must itself not be 253*cdf0e10cSrcweir null, and must point to either null or a valid string. 254*cdf0e10cSrcweir 255*cdf0e10cSrcweir If the encode mechanism is rtl_UriEncodeStrict, and pText cannot be 256*cdf0e10cSrcweir converted to eCharset because it contains unmappable characters (which 257*cdf0e10cSrcweir implies that pText is not empty), then an empty string is returned. 258*cdf0e10cSrcweir */ 259*cdf0e10cSrcweir void SAL_CALL rtl_uriEncode(rtl_uString * pText, 260*cdf0e10cSrcweir sal_Bool const * pCharClass, 261*cdf0e10cSrcweir rtl_UriEncodeMechanism eMechanism, 262*cdf0e10cSrcweir rtl_TextEncoding eCharset, 263*cdf0e10cSrcweir rtl_uString ** pResult) 264*cdf0e10cSrcweir SAL_THROW_EXTERN_C(); 265*cdf0e10cSrcweir 266*cdf0e10cSrcweir /** Decode (a part of) a URI. 267*cdf0e10cSrcweir 268*cdf0e10cSrcweir @param pText 269*cdf0e10cSrcweir Any Unicode string. Must not be null. (If the input is indeed part of a 270*cdf0e10cSrcweir valid URI, this string will only contain a subset of the ASCII characters, 271*cdf0e10cSrcweir but this function also handles other Unicode characters properly.) 272*cdf0e10cSrcweir 273*cdf0e10cSrcweir @param eMechanism 274*cdf0e10cSrcweir The mechanism describing how the input text is translated into a Unicode 275*cdf0e10cSrcweir string. 276*cdf0e10cSrcweir 277*cdf0e10cSrcweir @param eCharset 278*cdf0e10cSrcweir When the decode mechanism is rtl_UriDecodeWithCharset, all escape 279*cdf0e10cSrcweir sequences in the input text are interpreted as characters from this 280*cdf0e10cSrcweir charset. Those characters are translated to Unicode characters in the 281*cdf0e10cSrcweir resulting output, if possible. 282*cdf0e10cSrcweir 283*cdf0e10cSrcweir When the decode mechanism is rtl_UriDecodeNone or rtl_UriDecodeToIuri, 284*cdf0e10cSrcweir this parameter is ignored (and is best specified as 285*cdf0e10cSrcweir RTL_TEXTENCODING_UTF8). 286*cdf0e10cSrcweir 287*cdf0e10cSrcweir @param pResult 288*cdf0e10cSrcweir Returns a decoded representation of the input text. Must itself not be 289*cdf0e10cSrcweir null, and must point to either null or a valid string. 290*cdf0e10cSrcweir 291*cdf0e10cSrcweir If the decode mechanism is rtl_UriDecodeStrict, and pText cannot be 292*cdf0e10cSrcweir converted to eCharset because it contains (encodings of) unmappable 293*cdf0e10cSrcweir characters (which implies that pText is not empty), then an empty string is 294*cdf0e10cSrcweir returned. 295*cdf0e10cSrcweir */ 296*cdf0e10cSrcweir void SAL_CALL rtl_uriDecode(rtl_uString * pText, 297*cdf0e10cSrcweir rtl_UriDecodeMechanism eMechanism, 298*cdf0e10cSrcweir rtl_TextEncoding eCharset, 299*cdf0e10cSrcweir rtl_uString ** pResult) 300*cdf0e10cSrcweir SAL_THROW_EXTERN_C(); 301*cdf0e10cSrcweir 302*cdf0e10cSrcweir /** Convert a relative URI reference into an absolute one. 303*cdf0e10cSrcweir 304*cdf0e10cSrcweir A URI reference is a URI plus an optional <"#" fragment> part. 305*cdf0e10cSrcweir 306*cdf0e10cSrcweir This function uses the algorithm described in RFC 2396, section 5.2, with 307*cdf0e10cSrcweir the following clarifications: (1) Backwards-compatible relative URIs 308*cdf0e10cSrcweir starting with a scheme component (see RFC 2396, section 5.2, step 3) are not 309*cdf0e10cSrcweir supported. (2) Segments "." and ".." within the path of the base URI are 310*cdf0e10cSrcweir not considered special, RFC 2396 seems a bit unlcear about that point. 311*cdf0e10cSrcweir (3) Erroneous excess segments ".." within the path of the relative URI (if 312*cdf0e10cSrcweir it is indeed relative) are left intact, as the examples in RFC 2396, 313*cdf0e10cSrcweir section C.2, suggest. (4) If the relative URI is a reference to the 314*cdf0e10cSrcweir "current document," the "current document" is taken to be the base URI. 315*cdf0e10cSrcweir 316*cdf0e10cSrcweir This function signals exceptions by returning false and letting pException 317*cdf0e10cSrcweir point to a message explaining the exception. 318*cdf0e10cSrcweir 319*cdf0e10cSrcweir @param pBaseUriRef 320*cdf0e10cSrcweir An absolute, hierarchical URI reference that serves as the base URI. If it 321*cdf0e10cSrcweir has to be inspected (i.e., pRelUriRef is not an absolute URI already), and 322*cdf0e10cSrcweir if it either is not an absolute URI (i.e., does not begin with a 323*cdf0e10cSrcweir <scheme ":"> part) or has a path that is non-empty but does not start 324*cdf0e10cSrcweir with "/", an exception will be signaled. 325*cdf0e10cSrcweir 326*cdf0e10cSrcweir @param pRelUriRef 327*cdf0e10cSrcweir An URI reference that may be either absolute or relative. If it is 328*cdf0e10cSrcweir absolute, it will be returned unmodified (and it need not be hierarchical 329*cdf0e10cSrcweir then). 330*cdf0e10cSrcweir 331*cdf0e10cSrcweir @param pResult 332*cdf0e10cSrcweir Returns an absolute URI reference. Must itself not be null, and must point 333*cdf0e10cSrcweir to either null or a valid string. If an exception is signalled, it is left 334*cdf0e10cSrcweir unchanged. 335*cdf0e10cSrcweir 336*cdf0e10cSrcweir @param pException 337*cdf0e10cSrcweir Returns an explanatory message in case an exception is signalled. Must 338*cdf0e10cSrcweir itself not be null, and must point to either null or a valid string. If no 339*cdf0e10cSrcweir exception is signalled, it is left unchanged. 340*cdf0e10cSrcweir 341*cdf0e10cSrcweir @return 342*cdf0e10cSrcweir True if no exception is signalled, otherwise false. 343*cdf0e10cSrcweir */ 344*cdf0e10cSrcweir sal_Bool SAL_CALL rtl_uriConvertRelToAbs(rtl_uString * pBaseUriRef, 345*cdf0e10cSrcweir rtl_uString * pRelUriRef, 346*cdf0e10cSrcweir rtl_uString ** pResult, 347*cdf0e10cSrcweir rtl_uString ** pException) 348*cdf0e10cSrcweir SAL_THROW_EXTERN_C(); 349*cdf0e10cSrcweir 350*cdf0e10cSrcweir #if defined __cplusplus 351*cdf0e10cSrcweir } 352*cdf0e10cSrcweir #endif /* __cplusplus */ 353*cdf0e10cSrcweir 354*cdf0e10cSrcweir #endif /* _RTL_URI_H_ */ 355