xref: /aoo41x/main/sal/inc/rtl/uri.h (revision cdf0e10c)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 #ifndef _RTL_URI_H_
29 #define _RTL_URI_H_
30 
31 #include "rtl/textenc.h"
32 #include "rtl/ustring.h"
33 #include "sal/types.h"
34 
35 #if defined __cplusplus
36 extern "C" {
37 #endif /* __cplusplus */
38 
39 /**  Various predefined URI 'char classes.'
40 
41      @descr
42      A 'char class' defines which (ASCII) characters can be written 'as they
43      are' in a part of a URI, and which characters have to be written using
44      escape sequences ('%' followed by two hex digits).  Characters outside
45      the ASCII range are always written using escape sequences.
46 
47      @descr
48      If there are other frequently used char classes, they can be added to
49      this enumeration; the function rtl_getUriCharClass() has to be adapted
50      then, too.
51  */
52 typedef enum
53 {
54     /** The empty char class.
55 
56         @descr
57         All characters are written using escape sequences.
58      */
59     rtl_UriCharClassNone,
60 
61     /** The RFC 2732 <uric> char class.
62 
63         @descr
64         The 'valid' characters are !$&'()*+,-./:;=?@[]_~ plus digits and
65         letters.
66      */
67     rtl_UriCharClassUric,
68 
69     /** The RFC 2396 <uric_no_slash> char class.
70 
71         @descr
72         The 'valid' characters are !$&'()*+,-.:;=?@_~ plus digits and letters.
73      */
74     rtl_UriCharClassUricNoSlash,
75 
76     /** The RFC 2396 <rel_segment> char class.
77 
78         @descr
79         The 'valid' characters are !$&'()*+,-.;=@_~ plus digits and letters.
80      */
81     rtl_UriCharClassRelSegment,
82 
83     /** The RFC 2396 <reg_name> char class.
84 
85         @descr
86         The 'valid' characters are !$&'()*+,-.:;=@_~ plus digits and letters.
87      */
88     rtl_UriCharClassRegName,
89 
90     /** The RFC 2396 <userinfo> char class.
91 
92         @descr
93         The 'valid' characters are !$&'()*+,-.:;=_~ plus digits and letters.
94      */
95     rtl_UriCharClassUserinfo,
96 
97     /** The RFC 2396 <pchar> char class.
98 
99         @descr
100         The 'valid' characters are !$&'()*+,-.:=@_~ plus digits and letters.
101      */
102     rtl_UriCharClassPchar,
103 
104     /** The char class for the values of uno URL parameters.
105 
106         @descr
107         The 'valid' characters are !$&'()*+-./:?@_~ plus digits and letters.
108      */
109     rtl_UriCharClassUnoParamValue,
110     /* Not a char class; judging by its name, it only fixes the enum's size. */
111     rtl_UriCharClass_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
112 }
113 rtl_UriCharClass;
114 
115 /** The mechanism describing how escape sequences in the input of
116     rtl_uriEncode() are handled.
117  */
118 typedef enum
119 {
120     /** The special meaning of '%' is ignored (i.e., there are by definition
121         no escape sequences in the input).
122 
123         @descr
124         This mechanism is useful to encode user input as part of a URI (e.g.,
125         the user-supplied password in an ftp URL---'%20abcde' is a valid
126         password, so do not assume that the '%20' is an escaped space).
127      */
128     rtl_UriEncodeIgnoreEscapes,
129 
130     /** All escape sequences ('%' followed by two hex digits) are kept intact,
131         even if they represent characters that need not be escaped or if they
132         do not even map to characters in the given charset.
133 
134         @descr
135         This mechanism is useful when passing on complete URIs more or less
136         unmodified (e.g., within an HTTP proxy): missing escape sequences are
137         added, but existing escape sequences are not touched (except that any
138         lower case hex digits are replaced by upper case hex digits).
139      */
140     rtl_UriEncodeKeepEscapes,
141 
142     /** All escape sequences ('%' followed by two hex digits) are resolved in
143         a first step; only those that represent characters that need to be
144         escaped are kept intact.
145 
146         @descr
147         This mechanism is useful to properly encode complete URIs entered by
148         the user: the URI is brought into a 'canonical form,' but care is taken
149         not to damage (valid) escape sequences the (careful) user already
150         entered as such.
151      */
152     rtl_UriEncodeCheckEscapes,
153 
154     /** Like rtl_UriEncodeIgnoreEscapes, but indicating failure when converting
155         unmappable characters (rtl_uriEncode() then returns an empty string).
156 
157         @since UDK 3.2.0
158      */
159     rtl_UriEncodeStrict,
160 
161     /** Like rtl_UriEncodeKeepEscapes, but indicating failure when converting
162         unmappable characters (presumably as rtl_UriEncodeStrict does; confirm).
163 
164         @since UDK 3.2.7
165      */
166     rtl_UriEncodeStrictKeepEscapes,
167     /* Not a mechanism; judging by its name, it only fixes the enum's size. */
168     rtl_UriEncode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
169 }
170 rtl_UriEncodeMechanism;
171 
172 /** The mechanism describing how rtl_uriDecode() translates (part of) a URI
173     into a Unicode string.
174  */
175 typedef enum
176 {
177     /** The text is returned completely unmodified.
178      */
179     rtl_UriDecodeNone,
180 
181     /** The text is returned in the form of an IURI (cf.
182         draft-masinter-url-i18n-05.txt).
183 
184         @descr
185         All escape sequences representing ASCII characters (%00--%7F) are
186         kept, all other escape sequences are interpreted as UTF-8 characters
187         and translated to Unicode, if possible.
188      */
189     rtl_UriDecodeToIuri,
190 
191     /** The text is decoded.
192 
193         @descr
194         All escape sequences representing characters from the given charset
195         are decoded and translated to Unicode, if possible.
196      */
197     rtl_UriDecodeWithCharset,
198 
199     /** Like rtl_UriDecodeWithCharset, but indicating failure when converting
200         unmappable characters (rtl_uriDecode() then returns an empty string).
201 
202         @since UDK 3.2.0
203      */
204     rtl_UriDecodeStrict,
205     /* Not a mechanism; judging by its name, it only fixes the enum's size. */
206     rtl_UriDecode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
207 }
208 rtl_UriDecodeMechanism;
209 
210 /** Map a predefined rtl_UriCharClass to a form usable by rtl_uriEncode().
211 
212     @descr
213     rtl_uriEncode() expects an array of 128 booleans (one per ASCII character);
214     this function maps rtl_UriCharClass members to such (read-only) arrays.
215 
216     @param eCharClass
217     Any valid member of rtl_UriCharClass.
218 
219     @return
220     A const array of 128 booleans, to be used in calls to rtl_uriEncode().
221  */
222 sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass)
223     SAL_THROW_EXTERN_C();
224 
225 /** Encode a text as (part of) a URI.
226 
227     @param pText
228     Any Unicode string.  Must not be null.
229 
230     @param pCharClass
231     A char class, represented as an array of 128 booleans indexed by ASCII code
232     (true means keep the corresponding character unencoded, false means encode
233     it).  Must not be null, and the boolean corresponding to the percent sign
234     (index 0x25) must be false.  (See rtl_getUriCharClass() for a function
235     mapping from rtl_UriCharClass to such arrays.)
236 
237     @param eMechanism
238     The mechanism describing how escape sequences in the input text are
239     handled.
240 
241     @param eCharset
242     When Unicode characters from the input text have to be written using
243     escape sequences (because they are either outside the ASCII range or do
244     not belong to the given char class), they are first translated into this
245     charset before being encoded using escape sequences.
246 
247     Also, if the encode mechanism is rtl_UriEncodeCheckEscapes, all escape
248     sequences already present in the input text are interpreted as characters
249     from this charset.
250 
251     @param pResult
252     Returns an encoded representation of the input text.  Must itself not be
253     null, and must point to either null or a valid string.
254 
255     If the encode mechanism is rtl_UriEncodeStrict, and pText cannot be
256     converted to eCharset because it contains unmappable characters (which
257     implies that pText is not empty), then an empty string is returned.
258  */
259 void SAL_CALL rtl_uriEncode(rtl_uString * pText,
260                             sal_Bool const * pCharClass,
261                             rtl_UriEncodeMechanism eMechanism,
262                             rtl_TextEncoding eCharset,
263                             rtl_uString ** pResult)
264     SAL_THROW_EXTERN_C();
265 
266 /** Decode (a part of) a URI.
267 
268     @param pText
269     Any Unicode string.  Must not be null.  (If the input is indeed part of a
270     valid URI, this string will only contain a subset of the ASCII characters,
271     but this function also handles other Unicode characters properly.)
272 
273     @param eMechanism
274     The mechanism describing how the input text is translated into a Unicode
275     string.
276 
277     @param eCharset
278     When the decode mechanism is rtl_UriDecodeWithCharset (or the stricter
279     rtl_UriDecodeStrict), all escape sequences in the input text are interpreted
280     as characters from this charset.  Those characters are translated to
281     Unicode characters in the resulting output, if possible.
282 
283     When the decode mechanism is rtl_UriDecodeNone or rtl_UriDecodeToIuri,
284     this parameter is ignored (and is best specified as
285     RTL_TEXTENCODING_UTF8).
286 
287     @param pResult
288     Returns a decoded representation of the input text.  Must itself not be
289     null, and must point to either null or a valid string.
290 
291     If the decode mechanism is rtl_UriDecodeStrict, and pText cannot be
292     converted to eCharset because it contains (encodings of) unmappable
293     characters (which implies that pText is not empty), then an empty string is
294     returned.
295  */
296 void SAL_CALL rtl_uriDecode(rtl_uString * pText,
297                             rtl_UriDecodeMechanism eMechanism,
298                             rtl_TextEncoding eCharset,
299                             rtl_uString ** pResult)
300     SAL_THROW_EXTERN_C();
301 
302 /** Convert a relative URI reference into an absolute one.
303 
304     A URI reference is a URI plus an optional <"#" fragment> part.
305 
306     This function uses the algorithm described in RFC 2396, section 5.2, with
307     the following clarifications:  (1) Backwards-compatible relative URIs
308     starting with a scheme component (see RFC 2396, section 5.2, step 3) are not
309     supported.  (2) Segments "." and ".." within the path of the base URI are
310     not considered special; RFC 2396 seems a bit unclear about that point.
311     (3) Erroneous excess segments ".." within the path of the relative URI (if
312     it is indeed relative) are left intact, as the examples in RFC 2396,
313     section C.2, suggest.  (4) If the relative URI is a reference to the
314     "current document," the "current document" is taken to be the base URI.
315 
316     This function signals exceptions by returning false and letting pException
317     point to a message explaining the exception.
318 
319     @param pBaseUriRef
320     An absolute, hierarchical URI reference that serves as the base URI.  If it
321     has to be inspected (i.e., pRelUriRef is not an absolute URI already), and
322     if it either is not an absolute URI (i.e., does not begin with a
323     <scheme ":"> part) or has a path that is non-empty but does not start
324     with "/", an exception will be signaled.
325 
326     @param pRelUriRef
327     A URI reference that may be either absolute or relative.  If it is
328     absolute, it will be returned unmodified (and it need not be hierarchical
329     then).
330 
331     @param pResult
332     Returns an absolute URI reference.  Must itself not be null, and must point
333     to either null or a valid string.  If an exception is signaled, it is left
334     unchanged.
335 
336     @param pException
337     Returns an explanatory message in case an exception is signaled.  Must
338     itself not be null, and must point to either null or a valid string.  If no
339     exception is signaled, it is left unchanged.
340 
341     @return
342     True if no exception is signaled, otherwise false.
343  */
344 sal_Bool SAL_CALL rtl_uriConvertRelToAbs(rtl_uString * pBaseUriRef,
345                                          rtl_uString * pRelUriRef,
346                                          rtl_uString ** pResult,
347                                          rtl_uString ** pException)
348     SAL_THROW_EXTERN_C();
349 
350 #if defined __cplusplus
351 }
352 #endif /* __cplusplus */
353 
354 #endif /* _RTL_URI_H_ */
355