xref: /trunk/main/sal/inc/rtl/uri.h (revision 9eab2a37)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 #ifndef _RTL_URI_H_
25 #define _RTL_URI_H_
26 
27 #include "rtl/textenc.h"
28 #include "rtl/ustring.h"
29 #include "sal/types.h"
30 
31 #if defined __cplusplus
32 extern "C" {
33 #endif /* __cplusplus */
34 
35 /**  Various predefined URI 'char classes.'
36 
37      @descr
38      A 'char class' defines which (ASCII) characters can be written 'as they
39      are' in a part of a URI, and which characters have to be written using
40      escape sequences ('%' followed by two hex digits).  Characters outside
41      the ASCII range are always written using escape sequences.
42 
43      @descr
44      If there are other frequently used char classes, they can be added to
45      this enumeration; the function rtl_getUriCharClass() has to be adapted
46      then, too.
47  */
48 typedef enum
49 {
50     /** The empty char class.
51 
52         @descr
53         All characters are written using escape sequences.
54      */
55     rtl_UriCharClassNone,
56 
57     /** The RFC 2732 <uric> char class.
58 
59         @descr
60         The 'valid' characters are !$&'()*+,-./:;=?@[]_~ plus digits and
61         letters.
62      */
63     rtl_UriCharClassUric,
64 
65     /** The RFC 2396 <uric_no_slash> char class.
66 
67         @descr
68         The 'valid' characters are !$&'()*+,-.:;=?@_~ plus digits and letters.
69      */
70     rtl_UriCharClassUricNoSlash,
71 
72     /** The RFC 2396 <rel_segment> char class.
73 
74         @descr
75         The 'valid' characters are !$&'()*+,-.;=@_~ plus digits and letters.
76      */
77     rtl_UriCharClassRelSegment,
78 
79     /** The RFC 2396 <reg_name> char class.
80 
81         @descr
82         The 'valid' characters are !$&'()*+,-.:;=@_~ plus digits and letters.
83      */
84     rtl_UriCharClassRegName,
85 
86     /** The RFC 2396 <userinfo> char class.
87 
88         @descr
89         The 'valid' characters are !$&'()*+,-.:;=_~ plus digits and letters.
90      */
91     rtl_UriCharClassUserinfo,
92 
93     /** The RFC 2396 <pchar> char class.
94 
95         @descr
96         The 'valid' characters are !$&'()*+,-.:=@_~ plus digits and letters.
97      */
98     rtl_UriCharClassPchar,
99 
100     /** The char class for the values of uno URL parameters.
101 
102         @descr
103         The 'valid' characters are !$&'()*+-./:?@_~ plus digits and letters.
104      */
105     rtl_UriCharClassUnoParamValue,
106     /** Not a char class; presumably only pins the enum's size (TODO confirm). */
107     rtl_UriCharClass_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
108 }
109 rtl_UriCharClass;
110 
111 /** The mechanism describing how escape sequences in the input of
112     rtl_uriEncode() are handled.
113  */
114 typedef enum
115 {
116     /** The special meaning of '%' is ignored (i.e., there are by definition
117         no escape sequences in the input).
118 
119         @descr
120         This mechanism is useful to encode user input as part of a URI (e.g.,
121         the user-supplied password in an ftp URL---'%20abcde' is a valid
122         password, so do not assume that the '%20' is an escaped space).
123      */
124     rtl_UriEncodeIgnoreEscapes,
125 
126     /** All escape sequences ('%' followed by two hex digits) are kept intact,
127         even if they represent characters that need not be escaped or if they
128         do not even map to characters in the given charset.
129 
130         @descr
131         This mechanism is useful when passing on complete URIs more or less
132         unmodified (e.g., within an HTTP proxy): missing escape sequences are
133         added, but existing escape sequences are not touched (except that any
134         lower case hex digits are replaced by upper case hex digits).
135      */
136     rtl_UriEncodeKeepEscapes,
137 
138     /** All escape sequences ('%' followed by two hex digits) are resolved in
139         a first step; only those that represent characters that need to be
140         escaped are kept intact.
141 
142         @descr
143         This mechanism is useful to properly encode complete URIs entered by
144         the user: the URI is brought into a 'canonical form,' but care is taken
145         not to damage (valid) escape sequences the (careful) user already
146         entered as such.
147      */
148     rtl_UriEncodeCheckEscapes,
149 
150     /** Like rtl_UriEncodeIgnoreEscapes, but indicating failure when converting
151         unmappable characters.
152 
153         @since UDK 3.2.0
154      */
155     rtl_UriEncodeStrict,
156 
157     /** Like rtl_UriEncodeKeepEscapes, but indicating failure when converting
158         unmappable characters.
159 
160         @since UDK 3.2.7
161      */
162     rtl_UriEncodeStrictKeepEscapes,
163     /** Not a mechanism; presumably only pins the enum's size (TODO confirm). */
164     rtl_UriEncode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
165 }
166 rtl_UriEncodeMechanism;
167 
168 /** The mechanism describing how rtl_uriDecode() translates (part of) a URI
169     into a Unicode string.
170  */
171 typedef enum
172 {
173     /** The text is returned completely unmodified.
174      */
175     rtl_UriDecodeNone,
176 
177     /** The text is returned in the form of an IURI (cf.
178         draft-masinter-url-i18n-05.txt).
179 
180         @descr
181         All escape sequences representing ASCII characters (%00--%7F) are
182         kept, all other escape sequences are interpreted as UTF-8 characters
183         and translated to Unicode, if possible.
184      */
185     rtl_UriDecodeToIuri,
186 
187     /** The text is decoded.
188 
189         @descr
190         All escape sequences representing characters from the given charset
191         are decoded and translated to Unicode, if possible.
192      */
193     rtl_UriDecodeWithCharset,
194 
195     /** Like rtl_UriDecodeWithCharset, but indicating failure when converting
196         unmappable characters.
197 
198         @since UDK 3.2.0
199      */
200     rtl_UriDecodeStrict,
201     /** Not a mechanism; presumably only pins the enum's size (TODO confirm). */
202     rtl_UriDecode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
203 }
204 rtl_UriDecodeMechanism;
205 
206 /** Map a predefined rtl_UriCharClass to a form usable by rtl_uriEncode().
207 
208     @descr
209     rtl_uriEncode() expects an array of 128 booleans, one per ASCII character;
210     this function maps rtl_UriCharClass enumeration members to such arrays.
211 
212     @param eCharClass
213     Any valid member of rtl_UriCharClass.
214 
215     @return
216     An array of 128 booleans, to be used in calls to rtl_uriEncode().
217  */
218 sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass)
219     SAL_THROW_EXTERN_C();
220 
221 /** Encode a text as (part of) a URI.
222 
223     @param pText
224     Any Unicode string.  Must not be null.
225 
226     @param pCharClass
227     A char class, represented as an array of 128 booleans (true means keep the
228     corresponding ASCII character unencoded, false means encode it).  Must not
229     be null, and the boolean corresponding to the percent sign (0x25) must be
230     false.  (See rtl_getUriCharClass() for a function mapping from
231     rtl_UriCharClass to such arrays.)
232 
233     @param eMechanism
234     The mechanism describing how escape sequences in the input text are
235     handled.
236 
237     @param eCharset
238     When Unicode characters from the input text have to be written using
239     escape sequences (because they are either outside the ASCII range or do
240     not belong to the given char class), they are first translated into this
241     charset before being encoded using escape sequences.
242 
243     Also, if the encode mechanism is rtl_UriEncodeCheckEscapes, all escape
244     sequences already present in the input text are interpreted as characters
245     from this charset.
246 
247     @param pResult
248     Returns an encoded representation of the input text.  Must itself not be
249     null, and must point to either null or a valid string.
250 
251     If the encode mechanism is rtl_UriEncodeStrict (presumably likewise for
252     rtl_UriEncodeStrictKeepEscapes; TODO confirm), and pText has characters not
253     mappable to eCharset (so pText is not empty), an empty string is returned.
254  */
255 void SAL_CALL rtl_uriEncode(rtl_uString * pText,
256                             sal_Bool const * pCharClass,
257                             rtl_UriEncodeMechanism eMechanism,
258                             rtl_TextEncoding eCharset,
259                             rtl_uString ** pResult)
260     SAL_THROW_EXTERN_C();
261 
262 /** Decode (a part of) a URI.
263 
264     @param pText
265     Any Unicode string.  Must not be null.  (If the input is indeed part of a
266     valid URI, this string will only contain a subset of the ASCII characters,
267     but this function also handles other Unicode characters properly.)
268 
269     @param eMechanism
270     The mechanism describing how the input text is translated into a Unicode
271     string.
272 
273     @param eCharset
274     When the decode mechanism is rtl_UriDecodeWithCharset or
275     rtl_UriDecodeStrict, all escape sequences in the input text are interpreted
276     as characters from this charset.  Those characters are translated to
277     Unicode characters in the resulting output, if possible.
278 
279     When the decode mechanism is rtl_UriDecodeNone or rtl_UriDecodeToIuri,
280     this parameter is ignored (and is best specified as
281     RTL_TEXTENCODING_UTF8).
282 
283     @param pResult
284     Returns a decoded representation of the input text.  Must itself not be
285     null, and must point to either null or a valid string.
286 
287     If the decode mechanism is rtl_UriDecodeStrict, and pText cannot be
288     converted from eCharset because it contains (encodings of) unmappable
289     characters (which implies that pText is not empty), then an empty string is
290     returned.
291  */
292 void SAL_CALL rtl_uriDecode(rtl_uString * pText,
293                             rtl_UriDecodeMechanism eMechanism,
294                             rtl_TextEncoding eCharset,
295                             rtl_uString ** pResult)
296     SAL_THROW_EXTERN_C();
297 
298 /** Convert a relative URI reference into an absolute one.
299 
300     A URI reference is a URI plus an optional <"#" fragment> part.
301 
302     This function uses the algorithm described in RFC 2396, section 5.2, with
303     the following clarifications:  (1) Backwards-compatible relative URIs
304     starting with a scheme component (see RFC 2396, section 5.2, step 3) are not
305     supported.  (2) Segments "." and ".." within the path of the base URI are
306     not considered special; RFC 2396 seems a bit unclear about that point.
307     (3) Erroneous excess segments ".." within the path of the relative URI (if
308     it is indeed relative) are left intact, as the examples in RFC 2396,
309     section C.2, suggest.  (4) If the relative URI is a reference to the
310     "current document," the "current document" is taken to be the base URI.
311 
312     This function signals exceptions by returning false and letting pException
313     point to a message explaining the exception.
314 
315     @param pBaseUriRef
316     An absolute, hierarchical URI reference that serves as the base URI.  If it
317     has to be inspected (i.e., pRelUriRef is not an absolute URI already), and
318     if it either is not an absolute URI (i.e., does not begin with a
319     <scheme ":"> part) or has a path that is non-empty but does not start
320     with "/", an exception will be signalled.
321 
322     @param pRelUriRef
323     A URI reference that may be either absolute or relative.  If it is
324     absolute, it will be returned unmodified (and it need not be hierarchical
325     then).
326 
327     @param pResult
328     Returns an absolute URI reference.  Must itself not be null, and must point
329     to either null or a valid string.  If an exception is signalled, it is left
330     unchanged.
331 
332     @param pException
333     Returns an explanatory message in case an exception is signalled.  Must
334     itself not be null, and must point to either null or a valid string.  If no
335     exception is signalled, it is left unchanged.
336 
337     @return
338     True if no exception is signalled, otherwise false.
339  */
340 sal_Bool SAL_CALL rtl_uriConvertRelToAbs(rtl_uString * pBaseUriRef,
341                                          rtl_uString * pRelUriRef,
342                                          rtl_uString ** pResult,
343                                          rtl_uString ** pException)
344     SAL_THROW_EXTERN_C();
345 
346 #if defined __cplusplus
347 }
348 #endif /* __cplusplus */
349 
350 #endif /* _RTL_URI_H_ */
351