1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 #ifndef _RTL_URI_H_ 25 #define _RTL_URI_H_ 26 27 #include "rtl/textenc.h" 28 #include "rtl/ustring.h" 29 #include "sal/types.h" 30 31 #if defined __cplusplus 32 extern "C" { 33 #endif /* __cplusplus */ 34 35 /** Various predefined URI 'char classes.' 36 37 @descr 38 A 'char class' defines which (ASCII) characters can be written 'as they 39 are' in a part of a Uri, and which characters have to be written using 40 escape sequences ('%' followed by two hex digits). Characters outside 41 the ASCII range are always written using escape sequences. 42 43 @descr 44 If there are other frequently used char classes, they can be added to 45 this enumeration; the function rtl_getUriCharClass() has to be adapted 46 then, too. 47 */ 48 typedef enum 49 { 50 /** The empty char class. 51 52 @descr 53 All characters are written using escape sequences. 54 */ 55 rtl_UriCharClassNone, 56 57 /** The RFC 2732 <uric> char class. 58 59 @descr 60 The 'valid' characters are !$&'()*+,-./:;=?@[]_~ plus digits and 61 letters. 62 */ 63 rtl_UriCharClassUric, 64 65 /** The RFC 2396 <uric_no_slash> char class. 66 67 @descr 68 The 'valid' characters are !$&'()*+,-.:;=?@_~ plus digits and letters. 69 */ 70 rtl_UriCharClassUricNoSlash, 71 72 /** The RFC 2396 <rel_segment> char class. 73 74 @descr 75 The 'valid' characters are !$&'()*+,-.;=@_~ plus digits and letters. 76 */ 77 rtl_UriCharClassRelSegment, 78 79 /** The RFC 2396 <reg_name> char class. 80 81 @descr 82 The 'valid' characters are !$&'()*+,-.:;=@_~ plus digits and letters. 83 */ 84 rtl_UriCharClassRegName, 85 86 /** The RFC 2396 <userinfo> char class. 87 88 @descr 89 The 'valid' characters are !$&'()*+,-.:;=_~ plus digits and letters. 90 */ 91 rtl_UriCharClassUserinfo, 92 93 /** The RFC 2396 <pchar> char class. 94 95 @descr 96 The 'valid' characters are !$&'()*+,-.:=@_~ plus digits and letters. 97 */ 98 rtl_UriCharClassPchar, 99 100 /** The char class for the values of uno URL parameters. 101 102 @descr 103 The 'valid' characters are !$&'()*+-./:?@_~ plus digits and letters. 104 */ 105 rtl_UriCharClassUnoParamValue, 106 107 rtl_UriCharClass_FORCE_EQUAL_SIZE = SAL_MAX_ENUM 108 } 109 rtl_UriCharClass; 110 111 /** The mechanism describing how escape sequences in the input of 112 rtl_uriEncode() are handled. 113 */ 114 typedef enum 115 { 116 /** The special meaning of '%' is ignored (i.e., there are by definition 117 no escape sequences in the input). 118 119 @descr 120 This mechanism is useful to encode user input as part of a URI (e.g., 121 the user-supplied password in an ftp URL---'%20abcde' is a valid 122 password, so do not assume that the '%20' is an escaped space). 123 */ 124 rtl_UriEncodeIgnoreEscapes, 125 126 /** All escape sequences ('%' followed by two hex digits) are kept intact, 127 even if they represent characters that need not be escaped or if they 128 do not even map to characters in the given charset. 129 130 @descr 131 This mechanism is useful when passing on complete URIs more or less 132 unmodified (e.g., within an HTTP proxy): missing escape sequences are 133 added, but existing escape sequences are not touched (except that any 134 lower case hex digits are replaced by upper case hex digits). 135 */ 136 rtl_UriEncodeKeepEscapes, 137 138 /** All escape sequences ('%' followed by two hex digits) are resolved in 139 a first step; only those that represent characters that need to be 140 escaped are kept intact. 141 142 @descr 143 This mechanism is useful to properly encode complete URIs entered by 144 the user: the URI is brought into a 'canonic form,' but care is taken 145 not to damage (valid) escape sequences the (careful) user already 146 entered as such. 147 */ 148 rtl_UriEncodeCheckEscapes, 149 150 /** Like rtl_UriEncodeIgnoreEscapes, but indicating failure when converting 151 unmappable characters. 152 153 @since UDK 3.2.0 154 */ 155 rtl_UriEncodeStrict, 156 157 /** Like rtl_UriEncodeKeepEscapes, but indicating failure when converting 158 unmappable characters. 159 160 @since UDK 3.2.7 161 */ 162 rtl_UriEncodeStrictKeepEscapes, 163 164 rtl_UriEncode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM 165 } 166 rtl_UriEncodeMechanism; 167 168 /** The mechanism describing how rtl_uriDecode() translates (part of) a URI 169 into a Unicode string. 170 */ 171 typedef enum 172 { 173 /** The text is returned completely unmodified. 174 */ 175 rtl_UriDecodeNone, 176 177 /** The text is returned in the form of an IURI (cf. 178 draft-masinter-url-i18n-05.txt). 179 180 @descr 181 All escape sequences representing ASCII characters (%00--%7F) are 182 kept, all other escape sequences are interpreted as UTF-8 characters 183 and translated to Unicode, if possible. 184 */ 185 rtl_UriDecodeToIuri, 186 187 /** The text is decoded. 188 189 @descr 190 All escape sequences representing characters from the given charset 191 are decoded and translated to Unicode, if possible. 192 */ 193 rtl_UriDecodeWithCharset, 194 195 /** Like rtl_UriDecodeWithCharset, but indicating failure when converting 196 unmappable characters. 197 198 @since UDK 3.2.0 199 */ 200 rtl_UriDecodeStrict, 201 202 rtl_UriDecode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM 203 } 204 rtl_UriDecodeMechanism; 205 206 /** Map a predefined rtl_UriCharClass to a form usable by rtl_uriEncode(). 207 208 @descr 209 The function rtl_uriEncode() expects an array of 128 booleans, and this 210 function maps rtl_UriCharClass enumeration members to such arrays. 211 212 @param eCharClass 213 Any valid member of rtl_UriCharClass. 214 215 @return 216 An array of 128 booleans, to be used in calls to rtl_uriEncode(). 217 */ 218 sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass) 219 SAL_THROW_EXTERN_C(); 220 221 /** Encode a text as (part of) a URI. 222 223 @param pText 224 Any Unicode string. Must not be null. 225 226 @param pCharClass 227 A char class, represented as an array of 128 booleans (true means keep the 228 corresponding ASCII character unencoded, false means encode it). Must not 229 be null, and the boolean corresponding to the percent sign (0x25) must be 230 false. (See rtl_getUriCharClass() for a function mapping from 231 rtl_UriCharClass to such arrays.) 232 233 @param eMechanism 234 The mechanism describing how escape sequences in the input text are 235 handled. 236 237 @param eCharset 238 When Unicode characters from the input text have to be written using 239 escape sequences (because they are either outside the ASCII range or do 240 not belong to the given char class), they are first translated into this 241 charset before being encoded using escape sequences. 242 243 Also, if the encode mechanism is rtl_UriEncodeCheckEscapes, all escape 244 sequences already present in the input text are interpreted as characters 245 from this charset. 246 247 @param pResult 248 Returns an encoded representation of the input text. Must itself not be 249 null, and must point to either null or a valid string. 250 251 If the encode mechanism is rtl_UriEncodeStrict, and pText cannot be 252 converted to eCharset because it contains unmappable characters (which 253 implies that pText is not empty), then an empty string is returned. 254 */ 255 void SAL_CALL rtl_uriEncode(rtl_uString * pText, 256 sal_Bool const * pCharClass, 257 rtl_UriEncodeMechanism eMechanism, 258 rtl_TextEncoding eCharset, 259 rtl_uString ** pResult) 260 SAL_THROW_EXTERN_C(); 261 262 /** Decode (a part of) a URI. 263 264 @param pText 265 Any Unicode string. Must not be null. (If the input is indeed part of a 266 valid URI, this string will only contain a subset of the ASCII characters, 267 but this function also handles other Unicode characters properly.) 268 269 @param eMechanism 270 The mechanism describing how the input text is translated into a Unicode 271 string. 272 273 @param eCharset 274 When the decode mechanism is rtl_UriDecodeWithCharset, all escape 275 sequences in the input text are interpreted as characters from this 276 charset. Those characters are translated to Unicode characters in the 277 resulting output, if possible. 278 279 When the decode mechanism is rtl_UriDecodeNone or rtl_UriDecodeToIuri, 280 this parameter is ignored (and is best specified as 281 RTL_TEXTENCODING_UTF8). 282 283 @param pResult 284 Returns a decoded representation of the input text. Must itself not be 285 null, and must point to either null or a valid string. 286 287 If the decode mechanism is rtl_UriDecodeStrict, and pText cannot be 288 converted to eCharset because it contains (encodings of) unmappable 289 characters (which implies that pText is not empty), then an empty string is 290 returned. 291 */ 292 void SAL_CALL rtl_uriDecode(rtl_uString * pText, 293 rtl_UriDecodeMechanism eMechanism, 294 rtl_TextEncoding eCharset, 295 rtl_uString ** pResult) 296 SAL_THROW_EXTERN_C(); 297 298 /** Convert a relative URI reference into an absolute one. 299 300 A URI reference is a URI plus an optional <"#" fragment> part. 301 302 This function uses the algorithm described in RFC 2396, section 5.2, with 303 the following clarifications: (1) Backwards-compatible relative URIs 304 starting with a scheme component (see RFC 2396, section 5.2, step 3) are not 305 supported. (2) Segments "." and ".." within the path of the base URI are 306 not considered special, RFC 2396 seems a bit unlcear about that point. 307 (3) Erroneous excess segments ".." within the path of the relative URI (if 308 it is indeed relative) are left intact, as the examples in RFC 2396, 309 section C.2, suggest. (4) If the relative URI is a reference to the 310 "current document," the "current document" is taken to be the base URI. 311 312 This function signals exceptions by returning false and letting pException 313 point to a message explaining the exception. 314 315 @param pBaseUriRef 316 An absolute, hierarchical URI reference that serves as the base URI. If it 317 has to be inspected (i.e., pRelUriRef is not an absolute URI already), and 318 if it either is not an absolute URI (i.e., does not begin with a 319 <scheme ":"> part) or has a path that is non-empty but does not start 320 with "/", an exception will be signaled. 321 322 @param pRelUriRef 323 An URI reference that may be either absolute or relative. If it is 324 absolute, it will be returned unmodified (and it need not be hierarchical 325 then). 326 327 @param pResult 328 Returns an absolute URI reference. Must itself not be null, and must point 329 to either null or a valid string. If an exception is signalled, it is left 330 unchanged. 331 332 @param pException 333 Returns an explanatory message in case an exception is signalled. Must 334 itself not be null, and must point to either null or a valid string. If no 335 exception is signalled, it is left unchanged. 336 337 @return 338 True if no exception is signalled, otherwise false. 339 */ 340 sal_Bool SAL_CALL rtl_uriConvertRelToAbs(rtl_uString * pBaseUriRef, 341 rtl_uString * pRelUriRef, 342 rtl_uString ** pResult, 343 rtl_uString ** pException) 344 SAL_THROW_EXTERN_C(); 345 346 #if defined __cplusplus 347 } 348 #endif /* __cplusplus */ 349 350 #endif /* _RTL_URI_H_ */ 351