1*9eab2a37SAndrew Rist /************************************************************** 2cdf0e10cSrcweir * 3*9eab2a37SAndrew Rist * Licensed to the Apache Software Foundation (ASF) under one 4*9eab2a37SAndrew Rist * or more contributor license agreements. See the NOTICE file 5*9eab2a37SAndrew Rist * distributed with this work for additional information 6*9eab2a37SAndrew Rist * regarding copyright ownership. The ASF licenses this file 7*9eab2a37SAndrew Rist * to you under the Apache License, Version 2.0 (the 8*9eab2a37SAndrew Rist * "License"); you may not use this file except in compliance 9*9eab2a37SAndrew Rist * with the License. You may obtain a copy of the License at 10*9eab2a37SAndrew Rist * 11*9eab2a37SAndrew Rist * http://www.apache.org/licenses/LICENSE-2.0 12*9eab2a37SAndrew Rist * 13*9eab2a37SAndrew Rist * Unless required by applicable law or agreed to in writing, 14*9eab2a37SAndrew Rist * software distributed under the License is distributed on an 15*9eab2a37SAndrew Rist * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16*9eab2a37SAndrew Rist * KIND, either express or implied. See the License for the 17*9eab2a37SAndrew Rist * specific language governing permissions and limitations 18*9eab2a37SAndrew Rist * under the License. 19*9eab2a37SAndrew Rist * 20*9eab2a37SAndrew Rist *************************************************************/ 21*9eab2a37SAndrew Rist 22*9eab2a37SAndrew Rist 23cdf0e10cSrcweir 24cdf0e10cSrcweir #ifndef _RTL_URI_H_ 25cdf0e10cSrcweir #define _RTL_URI_H_ 26cdf0e10cSrcweir 27cdf0e10cSrcweir #include "rtl/textenc.h" 28cdf0e10cSrcweir #include "rtl/ustring.h" 29cdf0e10cSrcweir #include "sal/types.h" 30cdf0e10cSrcweir 31cdf0e10cSrcweir #if defined __cplusplus 32cdf0e10cSrcweir extern "C" { 33cdf0e10cSrcweir #endif /* __cplusplus */ 34cdf0e10cSrcweir 35cdf0e10cSrcweir /** Various predefined URI 'char classes.' 36cdf0e10cSrcweir 37cdf0e10cSrcweir @descr 38cdf0e10cSrcweir A 'char class' defines which (ASCII) characters can be written 'as they 39cdf0e10cSrcweir are' in a part of a Uri, and which characters have to be written using 40cdf0e10cSrcweir escape sequences ('%' followed by two hex digits). Characters outside 41cdf0e10cSrcweir the ASCII range are always written using escape sequences. 42cdf0e10cSrcweir 43cdf0e10cSrcweir @descr 44cdf0e10cSrcweir If there are other frequently used char classes, they can be added to 45cdf0e10cSrcweir this enumeration; the function rtl_getUriCharClass() has to be adapted 46cdf0e10cSrcweir then, too. 47cdf0e10cSrcweir */ 48cdf0e10cSrcweir typedef enum 49cdf0e10cSrcweir { 50cdf0e10cSrcweir /** The empty char class. 51cdf0e10cSrcweir 52cdf0e10cSrcweir @descr 53cdf0e10cSrcweir All characters are written using escape sequences. 54cdf0e10cSrcweir */ 55cdf0e10cSrcweir rtl_UriCharClassNone, 56cdf0e10cSrcweir 57cdf0e10cSrcweir /** The RFC 2732 <uric> char class. 58cdf0e10cSrcweir 59cdf0e10cSrcweir @descr 60cdf0e10cSrcweir The 'valid' characters are !$&'()*+,-./:;=?@[]_~ plus digits and 61cdf0e10cSrcweir letters. 62cdf0e10cSrcweir */ 63cdf0e10cSrcweir rtl_UriCharClassUric, 64cdf0e10cSrcweir 65cdf0e10cSrcweir /** The RFC 2396 <uric_no_slash> char class. 66cdf0e10cSrcweir 67cdf0e10cSrcweir @descr 68cdf0e10cSrcweir The 'valid' characters are !$&'()*+,-.:;=?@_~ plus digits and letters. 69cdf0e10cSrcweir */ 70cdf0e10cSrcweir rtl_UriCharClassUricNoSlash, 71cdf0e10cSrcweir 72cdf0e10cSrcweir /** The RFC 2396 <rel_segment> char class. 73cdf0e10cSrcweir 74cdf0e10cSrcweir @descr 75cdf0e10cSrcweir The 'valid' characters are !$&'()*+,-.;=@_~ plus digits and letters. 76cdf0e10cSrcweir */ 77cdf0e10cSrcweir rtl_UriCharClassRelSegment, 78cdf0e10cSrcweir 79cdf0e10cSrcweir /** The RFC 2396 <reg_name> char class. 80cdf0e10cSrcweir 81cdf0e10cSrcweir @descr 82cdf0e10cSrcweir The 'valid' characters are !$&'()*+,-.:;=@_~ plus digits and letters. 83cdf0e10cSrcweir */ 84cdf0e10cSrcweir rtl_UriCharClassRegName, 85cdf0e10cSrcweir 86cdf0e10cSrcweir /** The RFC 2396 <userinfo> char class. 87cdf0e10cSrcweir 88cdf0e10cSrcweir @descr 89cdf0e10cSrcweir The 'valid' characters are !$&'()*+,-.:;=_~ plus digits and letters. 90cdf0e10cSrcweir */ 91cdf0e10cSrcweir rtl_UriCharClassUserinfo, 92cdf0e10cSrcweir 93cdf0e10cSrcweir /** The RFC 2396 <pchar> char class. 94cdf0e10cSrcweir 95cdf0e10cSrcweir @descr 96cdf0e10cSrcweir The 'valid' characters are !$&'()*+,-.:=@_~ plus digits and letters. 97cdf0e10cSrcweir */ 98cdf0e10cSrcweir rtl_UriCharClassPchar, 99cdf0e10cSrcweir 100cdf0e10cSrcweir /** The char class for the values of uno URL parameters. 101cdf0e10cSrcweir 102cdf0e10cSrcweir @descr 103cdf0e10cSrcweir The 'valid' characters are !$&'()*+-./:?@_~ plus digits and letters. 104cdf0e10cSrcweir */ 105cdf0e10cSrcweir rtl_UriCharClassUnoParamValue, 106cdf0e10cSrcweir 107cdf0e10cSrcweir rtl_UriCharClass_FORCE_EQUAL_SIZE = SAL_MAX_ENUM 108cdf0e10cSrcweir } 109cdf0e10cSrcweir rtl_UriCharClass; 110cdf0e10cSrcweir 111cdf0e10cSrcweir /** The mechanism describing how escape sequences in the input of 112cdf0e10cSrcweir rtl_uriEncode() are handled. 113cdf0e10cSrcweir */ 114cdf0e10cSrcweir typedef enum 115cdf0e10cSrcweir { 116cdf0e10cSrcweir /** The special meaning of '%' is ignored (i.e., there are by definition 117cdf0e10cSrcweir no escape sequences in the input). 118cdf0e10cSrcweir 119cdf0e10cSrcweir @descr 120cdf0e10cSrcweir This mechanism is useful to encode user input as part of a URI (e.g., 121cdf0e10cSrcweir the user-supplied password in an ftp URL---'%20abcde' is a valid 122cdf0e10cSrcweir password, so do not assume that the '%20' is an escaped space). 123cdf0e10cSrcweir */ 124cdf0e10cSrcweir rtl_UriEncodeIgnoreEscapes, 125cdf0e10cSrcweir 126cdf0e10cSrcweir /** All escape sequences ('%' followed by two hex digits) are kept intact, 127cdf0e10cSrcweir even if they represent characters that need not be escaped or if they 128cdf0e10cSrcweir do not even map to characters in the given charset. 129cdf0e10cSrcweir 130cdf0e10cSrcweir @descr 131cdf0e10cSrcweir This mechanism is useful when passing on complete URIs more or less 132cdf0e10cSrcweir unmodified (e.g., within an HTTP proxy): missing escape sequences are 133cdf0e10cSrcweir added, but existing escape sequences are not touched (except that any 134cdf0e10cSrcweir lower case hex digits are replaced by upper case hex digits). 135cdf0e10cSrcweir */ 136cdf0e10cSrcweir rtl_UriEncodeKeepEscapes, 137cdf0e10cSrcweir 138cdf0e10cSrcweir /** All escape sequences ('%' followed by two hex digits) are resolved in 139cdf0e10cSrcweir a first step; only those that represent characters that need to be 140cdf0e10cSrcweir escaped are kept intact. 141cdf0e10cSrcweir 142cdf0e10cSrcweir @descr 143cdf0e10cSrcweir This mechanism is useful to properly encode complete URIs entered by 144cdf0e10cSrcweir the user: the URI is brought into a 'canonic form,' but care is taken 145cdf0e10cSrcweir not to damage (valid) escape sequences the (careful) user already 146cdf0e10cSrcweir entered as such. 147cdf0e10cSrcweir */ 148cdf0e10cSrcweir rtl_UriEncodeCheckEscapes, 149cdf0e10cSrcweir 150cdf0e10cSrcweir /** Like rtl_UriEncodeIgnoreEscapes, but indicating failure when converting 151cdf0e10cSrcweir unmappable characters. 152cdf0e10cSrcweir 153cdf0e10cSrcweir @since UDK 3.2.0 154cdf0e10cSrcweir */ 155cdf0e10cSrcweir rtl_UriEncodeStrict, 156cdf0e10cSrcweir 157cdf0e10cSrcweir /** Like rtl_UriEncodeKeepEscapes, but indicating failure when converting 158cdf0e10cSrcweir unmappable characters. 159cdf0e10cSrcweir 160cdf0e10cSrcweir @since UDK 3.2.7 161cdf0e10cSrcweir */ 162cdf0e10cSrcweir rtl_UriEncodeStrictKeepEscapes, 163cdf0e10cSrcweir 164cdf0e10cSrcweir rtl_UriEncode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM 165cdf0e10cSrcweir } 166cdf0e10cSrcweir rtl_UriEncodeMechanism; 167cdf0e10cSrcweir 168cdf0e10cSrcweir /** The mechanism describing how rtl_uriDecode() translates (part of) a URI 169cdf0e10cSrcweir into a Unicode string. 170cdf0e10cSrcweir */ 171cdf0e10cSrcweir typedef enum 172cdf0e10cSrcweir { 173cdf0e10cSrcweir /** The text is returned completely unmodified. 174cdf0e10cSrcweir */ 175cdf0e10cSrcweir rtl_UriDecodeNone, 176cdf0e10cSrcweir 177cdf0e10cSrcweir /** The text is returned in the form of an IURI (cf. 178cdf0e10cSrcweir draft-masinter-url-i18n-05.txt). 179cdf0e10cSrcweir 180cdf0e10cSrcweir @descr 181cdf0e10cSrcweir All escape sequences representing ASCII characters (%00--%7F) are 182cdf0e10cSrcweir kept, all other escape sequences are interpreted as UTF-8 characters 183cdf0e10cSrcweir and translated to Unicode, if possible. 184cdf0e10cSrcweir */ 185cdf0e10cSrcweir rtl_UriDecodeToIuri, 186cdf0e10cSrcweir 187cdf0e10cSrcweir /** The text is decoded. 188cdf0e10cSrcweir 189cdf0e10cSrcweir @descr 190cdf0e10cSrcweir All escape sequences representing characters from the given charset 191cdf0e10cSrcweir are decoded and translated to Unicode, if possible. 192cdf0e10cSrcweir */ 193cdf0e10cSrcweir rtl_UriDecodeWithCharset, 194cdf0e10cSrcweir 195cdf0e10cSrcweir /** Like rtl_UriDecodeWithCharset, but indicating failure when converting 196cdf0e10cSrcweir unmappable characters. 197cdf0e10cSrcweir 198cdf0e10cSrcweir @since UDK 3.2.0 199cdf0e10cSrcweir */ 200cdf0e10cSrcweir rtl_UriDecodeStrict, 201cdf0e10cSrcweir 202cdf0e10cSrcweir rtl_UriDecode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM 203cdf0e10cSrcweir } 204cdf0e10cSrcweir rtl_UriDecodeMechanism; 205cdf0e10cSrcweir 206cdf0e10cSrcweir /** Map a predefined rtl_UriCharClass to a form usable by rtl_uriEncode(). 207cdf0e10cSrcweir 208cdf0e10cSrcweir @descr 209cdf0e10cSrcweir The function rtl_uriEncode() expects an array of 128 booleans, and this 210cdf0e10cSrcweir function maps rtl_UriCharClass enumeration members to such arrays. 211cdf0e10cSrcweir 212cdf0e10cSrcweir @param eCharClass 213cdf0e10cSrcweir Any valid member of rtl_UriCharClass. 214cdf0e10cSrcweir 215cdf0e10cSrcweir @return 216cdf0e10cSrcweir An array of 128 booleans, to be used in calls to rtl_uriEncode(). 217cdf0e10cSrcweir */ 218cdf0e10cSrcweir sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass) 219cdf0e10cSrcweir SAL_THROW_EXTERN_C(); 220cdf0e10cSrcweir 221cdf0e10cSrcweir /** Encode a text as (part of) a URI. 222cdf0e10cSrcweir 223cdf0e10cSrcweir @param pText 224cdf0e10cSrcweir Any Unicode string. Must not be null. 225cdf0e10cSrcweir 226cdf0e10cSrcweir @param pCharClass 227cdf0e10cSrcweir A char class, represented as an array of 128 booleans (true means keep the 228cdf0e10cSrcweir corresponding ASCII character unencoded, false means encode it). Must not 229cdf0e10cSrcweir be null, and the boolean corresponding to the percent sign (0x25) must be 230cdf0e10cSrcweir false. (See rtl_getUriCharClass() for a function mapping from 231cdf0e10cSrcweir rtl_UriCharClass to such arrays.) 232cdf0e10cSrcweir 233cdf0e10cSrcweir @param eMechanism 234cdf0e10cSrcweir The mechanism describing how escape sequences in the input text are 235cdf0e10cSrcweir handled. 236cdf0e10cSrcweir 237cdf0e10cSrcweir @param eCharset 238cdf0e10cSrcweir When Unicode characters from the input text have to be written using 239cdf0e10cSrcweir escape sequences (because they are either outside the ASCII range or do 240cdf0e10cSrcweir not belong to the given char class), they are first translated into this 241cdf0e10cSrcweir charset before being encoded using escape sequences. 242cdf0e10cSrcweir 243cdf0e10cSrcweir Also, if the encode mechanism is rtl_UriEncodeCheckEscapes, all escape 244cdf0e10cSrcweir sequences already present in the input text are interpreted as characters 245cdf0e10cSrcweir from this charset. 246cdf0e10cSrcweir 247cdf0e10cSrcweir @param pResult 248cdf0e10cSrcweir Returns an encoded representation of the input text. Must itself not be 249cdf0e10cSrcweir null, and must point to either null or a valid string. 250cdf0e10cSrcweir 251cdf0e10cSrcweir If the encode mechanism is rtl_UriEncodeStrict, and pText cannot be 252cdf0e10cSrcweir converted to eCharset because it contains unmappable characters (which 253cdf0e10cSrcweir implies that pText is not empty), then an empty string is returned. 254cdf0e10cSrcweir */ 255cdf0e10cSrcweir void SAL_CALL rtl_uriEncode(rtl_uString * pText, 256cdf0e10cSrcweir sal_Bool const * pCharClass, 257cdf0e10cSrcweir rtl_UriEncodeMechanism eMechanism, 258cdf0e10cSrcweir rtl_TextEncoding eCharset, 259cdf0e10cSrcweir rtl_uString ** pResult) 260cdf0e10cSrcweir SAL_THROW_EXTERN_C(); 261cdf0e10cSrcweir 262cdf0e10cSrcweir /** Decode (a part of) a URI. 263cdf0e10cSrcweir 264cdf0e10cSrcweir @param pText 265cdf0e10cSrcweir Any Unicode string. Must not be null. (If the input is indeed part of a 266cdf0e10cSrcweir valid URI, this string will only contain a subset of the ASCII characters, 267cdf0e10cSrcweir but this function also handles other Unicode characters properly.) 268cdf0e10cSrcweir 269cdf0e10cSrcweir @param eMechanism 270cdf0e10cSrcweir The mechanism describing how the input text is translated into a Unicode 271cdf0e10cSrcweir string. 272cdf0e10cSrcweir 273cdf0e10cSrcweir @param eCharset 274cdf0e10cSrcweir When the decode mechanism is rtl_UriDecodeWithCharset, all escape 275cdf0e10cSrcweir sequences in the input text are interpreted as characters from this 276cdf0e10cSrcweir charset. Those characters are translated to Unicode characters in the 277cdf0e10cSrcweir resulting output, if possible. 278cdf0e10cSrcweir 279cdf0e10cSrcweir When the decode mechanism is rtl_UriDecodeNone or rtl_UriDecodeToIuri, 280cdf0e10cSrcweir this parameter is ignored (and is best specified as 281cdf0e10cSrcweir RTL_TEXTENCODING_UTF8). 282cdf0e10cSrcweir 283cdf0e10cSrcweir @param pResult 284cdf0e10cSrcweir Returns a decoded representation of the input text. Must itself not be 285cdf0e10cSrcweir null, and must point to either null or a valid string. 286cdf0e10cSrcweir 287cdf0e10cSrcweir If the decode mechanism is rtl_UriDecodeStrict, and pText cannot be 288cdf0e10cSrcweir converted to eCharset because it contains (encodings of) unmappable 289cdf0e10cSrcweir characters (which implies that pText is not empty), then an empty string is 290cdf0e10cSrcweir returned. 291cdf0e10cSrcweir */ 292cdf0e10cSrcweir void SAL_CALL rtl_uriDecode(rtl_uString * pText, 293cdf0e10cSrcweir rtl_UriDecodeMechanism eMechanism, 294cdf0e10cSrcweir rtl_TextEncoding eCharset, 295cdf0e10cSrcweir rtl_uString ** pResult) 296cdf0e10cSrcweir SAL_THROW_EXTERN_C(); 297cdf0e10cSrcweir 298cdf0e10cSrcweir /** Convert a relative URI reference into an absolute one. 299cdf0e10cSrcweir 300cdf0e10cSrcweir A URI reference is a URI plus an optional <"#" fragment> part. 301cdf0e10cSrcweir 302cdf0e10cSrcweir This function uses the algorithm described in RFC 2396, section 5.2, with 303cdf0e10cSrcweir the following clarifications: (1) Backwards-compatible relative URIs 304cdf0e10cSrcweir starting with a scheme component (see RFC 2396, section 5.2, step 3) are not 305cdf0e10cSrcweir supported. (2) Segments "." and ".." within the path of the base URI are 306cdf0e10cSrcweir not considered special, RFC 2396 seems a bit unlcear about that point. 307cdf0e10cSrcweir (3) Erroneous excess segments ".." within the path of the relative URI (if 308cdf0e10cSrcweir it is indeed relative) are left intact, as the examples in RFC 2396, 309cdf0e10cSrcweir section C.2, suggest. (4) If the relative URI is a reference to the 310cdf0e10cSrcweir "current document," the "current document" is taken to be the base URI. 311cdf0e10cSrcweir 312cdf0e10cSrcweir This function signals exceptions by returning false and letting pException 313cdf0e10cSrcweir point to a message explaining the exception. 314cdf0e10cSrcweir 315cdf0e10cSrcweir @param pBaseUriRef 316cdf0e10cSrcweir An absolute, hierarchical URI reference that serves as the base URI. If it 317cdf0e10cSrcweir has to be inspected (i.e., pRelUriRef is not an absolute URI already), and 318cdf0e10cSrcweir if it either is not an absolute URI (i.e., does not begin with a 319cdf0e10cSrcweir <scheme ":"> part) or has a path that is non-empty but does not start 320cdf0e10cSrcweir with "/", an exception will be signaled. 321cdf0e10cSrcweir 322cdf0e10cSrcweir @param pRelUriRef 323cdf0e10cSrcweir An URI reference that may be either absolute or relative. If it is 324cdf0e10cSrcweir absolute, it will be returned unmodified (and it need not be hierarchical 325cdf0e10cSrcweir then). 326cdf0e10cSrcweir 327cdf0e10cSrcweir @param pResult 328cdf0e10cSrcweir Returns an absolute URI reference. Must itself not be null, and must point 329cdf0e10cSrcweir to either null or a valid string. If an exception is signalled, it is left 330cdf0e10cSrcweir unchanged. 331cdf0e10cSrcweir 332cdf0e10cSrcweir @param pException 333cdf0e10cSrcweir Returns an explanatory message in case an exception is signalled. Must 334cdf0e10cSrcweir itself not be null, and must point to either null or a valid string. If no 335cdf0e10cSrcweir exception is signalled, it is left unchanged. 336cdf0e10cSrcweir 337cdf0e10cSrcweir @return 338cdf0e10cSrcweir True if no exception is signalled, otherwise false. 339cdf0e10cSrcweir */ 340cdf0e10cSrcweir sal_Bool SAL_CALL rtl_uriConvertRelToAbs(rtl_uString * pBaseUriRef, 341cdf0e10cSrcweir rtl_uString * pRelUriRef, 342cdf0e10cSrcweir rtl_uString ** pResult, 343cdf0e10cSrcweir rtl_uString ** pException) 344cdf0e10cSrcweir SAL_THROW_EXTERN_C(); 345cdf0e10cSrcweir 346cdf0e10cSrcweir #if defined __cplusplus 347cdf0e10cSrcweir } 348cdf0e10cSrcweir #endif /* __cplusplus */ 349cdf0e10cSrcweir 350cdf0e10cSrcweir #endif /* _RTL_URI_H_ */ 351