xref: /aoo41x/main/sal/inc/rtl/uri.h (revision 9eab2a37)
1*9eab2a37SAndrew Rist /**************************************************************
2cdf0e10cSrcweir  *
3*9eab2a37SAndrew Rist  * Licensed to the Apache Software Foundation (ASF) under one
4*9eab2a37SAndrew Rist  * or more contributor license agreements.  See the NOTICE file
5*9eab2a37SAndrew Rist  * distributed with this work for additional information
6*9eab2a37SAndrew Rist  * regarding copyright ownership.  The ASF licenses this file
7*9eab2a37SAndrew Rist  * to you under the Apache License, Version 2.0 (the
8*9eab2a37SAndrew Rist  * "License"); you may not use this file except in compliance
9*9eab2a37SAndrew Rist  * with the License.  You may obtain a copy of the License at
10*9eab2a37SAndrew Rist  *
11*9eab2a37SAndrew Rist  *   http://www.apache.org/licenses/LICENSE-2.0
12*9eab2a37SAndrew Rist  *
13*9eab2a37SAndrew Rist  * Unless required by applicable law or agreed to in writing,
14*9eab2a37SAndrew Rist  * software distributed under the License is distributed on an
15*9eab2a37SAndrew Rist  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16*9eab2a37SAndrew Rist  * KIND, either express or implied.  See the License for the
17*9eab2a37SAndrew Rist  * specific language governing permissions and limitations
18*9eab2a37SAndrew Rist  * under the License.
19*9eab2a37SAndrew Rist  *
20*9eab2a37SAndrew Rist  *************************************************************/
21*9eab2a37SAndrew Rist 
22*9eab2a37SAndrew Rist 
23cdf0e10cSrcweir 
24cdf0e10cSrcweir #ifndef _RTL_URI_H_
25cdf0e10cSrcweir #define _RTL_URI_H_
26cdf0e10cSrcweir 
27cdf0e10cSrcweir #include "rtl/textenc.h"
28cdf0e10cSrcweir #include "rtl/ustring.h"
29cdf0e10cSrcweir #include "sal/types.h"
30cdf0e10cSrcweir 
31cdf0e10cSrcweir #if defined __cplusplus
32cdf0e10cSrcweir extern "C" {
33cdf0e10cSrcweir #endif /* __cplusplus */
34cdf0e10cSrcweir 
35cdf0e10cSrcweir /**  Various predefined URI 'char classes.'
36cdf0e10cSrcweir 
37cdf0e10cSrcweir      @descr
38cdf0e10cSrcweir      A 'char class' defines which (ASCII) characters can be written 'as they
39cdf0e10cSrcweir      are' in a part of a URI, and which characters have to be written using
40cdf0e10cSrcweir      escape sequences ('%' followed by two hex digits).  Characters outside
41cdf0e10cSrcweir      the ASCII range are always written using escape sequences.
42cdf0e10cSrcweir 
43cdf0e10cSrcweir      @descr
44cdf0e10cSrcweir      If there are other frequently used char classes, they can be added to
45cdf0e10cSrcweir      this enumeration; the function rtl_getUriCharClass() has to be adapted
46cdf0e10cSrcweir      then, too.
     The enumerators are implicitly numbered from 0 in declaration order, so
     a new class inserted anywhere but directly before
     rtl_UriCharClass_FORCE_EQUAL_SIZE renumbers the existing ones.
47cdf0e10cSrcweir  */
48cdf0e10cSrcweir typedef enum
49cdf0e10cSrcweir {
50cdf0e10cSrcweir     /** The empty char class.
51cdf0e10cSrcweir 
52cdf0e10cSrcweir         @descr
53cdf0e10cSrcweir         All characters are written using escape sequences.
54cdf0e10cSrcweir      */
55cdf0e10cSrcweir     rtl_UriCharClassNone,
56cdf0e10cSrcweir 
57cdf0e10cSrcweir     /** The RFC 2732 <uric> char class.
58cdf0e10cSrcweir 
59cdf0e10cSrcweir         @descr
60cdf0e10cSrcweir         The 'valid' characters are !$&'()*+,-./:;=?@[]_~ plus digits and
61cdf0e10cSrcweir         letters.
62cdf0e10cSrcweir      */
63cdf0e10cSrcweir     rtl_UriCharClassUric,
64cdf0e10cSrcweir 
65cdf0e10cSrcweir     /** The RFC 2396 <uric_no_slash> char class.
66cdf0e10cSrcweir 
67cdf0e10cSrcweir         @descr
68cdf0e10cSrcweir         The 'valid' characters are !$&'()*+,-.:;=?@_~ plus digits and letters.
69cdf0e10cSrcweir      */
70cdf0e10cSrcweir     rtl_UriCharClassUricNoSlash,
71cdf0e10cSrcweir 
72cdf0e10cSrcweir     /** The RFC 2396 <rel_segment> char class.
73cdf0e10cSrcweir 
74cdf0e10cSrcweir         @descr
75cdf0e10cSrcweir         The 'valid' characters are !$&'()*+,-.;=@_~ plus digits and letters.
76cdf0e10cSrcweir      */
77cdf0e10cSrcweir     rtl_UriCharClassRelSegment,
78cdf0e10cSrcweir 
79cdf0e10cSrcweir     /** The RFC 2396 <reg_name> char class.
80cdf0e10cSrcweir 
81cdf0e10cSrcweir         @descr
82cdf0e10cSrcweir         The 'valid' characters are !$&'()*+,-.:;=@_~ plus digits and letters.
83cdf0e10cSrcweir      */
84cdf0e10cSrcweir     rtl_UriCharClassRegName,
85cdf0e10cSrcweir 
86cdf0e10cSrcweir     /** The RFC 2396 <userinfo> char class.
87cdf0e10cSrcweir 
88cdf0e10cSrcweir         @descr
89cdf0e10cSrcweir         The 'valid' characters are !$&'()*+,-.:;=_~ plus digits and letters.
90cdf0e10cSrcweir      */
91cdf0e10cSrcweir     rtl_UriCharClassUserinfo,
92cdf0e10cSrcweir 
93cdf0e10cSrcweir     /** The RFC 2396 <pchar> char class.
94cdf0e10cSrcweir 
95cdf0e10cSrcweir         @descr
96cdf0e10cSrcweir         The 'valid' characters are !$&'()*+,-.:=@_~ plus digits and letters.
97cdf0e10cSrcweir      */
98cdf0e10cSrcweir     rtl_UriCharClassPchar,
99cdf0e10cSrcweir 
100cdf0e10cSrcweir     /** The char class for the values of uno URL parameters.
101cdf0e10cSrcweir 
102cdf0e10cSrcweir         @descr
103cdf0e10cSrcweir         The 'valid' characters are !$&'()*+-./:?@_~ plus digits and letters.
104cdf0e10cSrcweir      */
105cdf0e10cSrcweir     rtl_UriCharClassUnoParamValue,
106cdf0e10cSrcweir 
    /** Not a char class.

        NOTE(review): presumably present only to force the enum to a fixed
        size via SAL_MAX_ENUM -- confirm against sal/types.h.
     */
107cdf0e10cSrcweir     rtl_UriCharClass_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
108cdf0e10cSrcweir }
109cdf0e10cSrcweir rtl_UriCharClass;
110cdf0e10cSrcweir 
111cdf0e10cSrcweir /** The mechanism describing how escape sequences in the input of
112cdf0e10cSrcweir     rtl_uriEncode() are handled.

    The enumerators are implicitly numbered from 0 in declaration order, so
    a new mechanism inserted anywhere but directly before
    rtl_UriEncode_FORCE_EQUAL_SIZE renumbers the existing ones.
113cdf0e10cSrcweir  */
114cdf0e10cSrcweir typedef enum
115cdf0e10cSrcweir {
116cdf0e10cSrcweir     /** The special meaning of '%' is ignored (i.e., there are by definition
117cdf0e10cSrcweir         no escape sequences in the input).
118cdf0e10cSrcweir 
119cdf0e10cSrcweir         @descr
120cdf0e10cSrcweir         This mechanism is useful to encode user input as part of a URI (e.g.,
121cdf0e10cSrcweir         the user-supplied password in an ftp URL---'%20abcde' is a valid
122cdf0e10cSrcweir         password, so do not assume that the '%20' is an escaped space).
123cdf0e10cSrcweir      */
124cdf0e10cSrcweir     rtl_UriEncodeIgnoreEscapes,
125cdf0e10cSrcweir 
126cdf0e10cSrcweir     /** All escape sequences ('%' followed by two hex digits) are kept intact,
127cdf0e10cSrcweir         even if they represent characters that need not be escaped or if they
128cdf0e10cSrcweir         do not even map to characters in the given charset.
129cdf0e10cSrcweir 
130cdf0e10cSrcweir         @descr
131cdf0e10cSrcweir         This mechanism is useful when passing on complete URIs more or less
132cdf0e10cSrcweir         unmodified (e.g., within an HTTP proxy): missing escape sequences are
133cdf0e10cSrcweir         added, but existing escape sequences are not touched (except that any
134cdf0e10cSrcweir         lower case hex digits are replaced by upper case hex digits).
135cdf0e10cSrcweir      */
136cdf0e10cSrcweir     rtl_UriEncodeKeepEscapes,
137cdf0e10cSrcweir 
138cdf0e10cSrcweir     /** All escape sequences ('%' followed by two hex digits) are resolved in
139cdf0e10cSrcweir         a first step; only those that represent characters that need to be
140cdf0e10cSrcweir         escaped are kept intact.
141cdf0e10cSrcweir 
142cdf0e10cSrcweir         @descr
143cdf0e10cSrcweir         This mechanism is useful to properly encode complete URIs entered by
144cdf0e10cSrcweir         the user: the URI is brought into a 'canonic form,' but care is taken
145cdf0e10cSrcweir         not to damage (valid) escape sequences the (careful) user already
146cdf0e10cSrcweir         entered as such.
147cdf0e10cSrcweir      */
148cdf0e10cSrcweir     rtl_UriEncodeCheckEscapes,
149cdf0e10cSrcweir 
150cdf0e10cSrcweir     /** Like rtl_UriEncodeIgnoreEscapes, but indicating failure when converting
151cdf0e10cSrcweir         unmappable characters.
152cdf0e10cSrcweir 
153cdf0e10cSrcweir         @since UDK 3.2.0
154cdf0e10cSrcweir      */
155cdf0e10cSrcweir     rtl_UriEncodeStrict,
156cdf0e10cSrcweir 
157cdf0e10cSrcweir     /** Like rtl_UriEncodeKeepEscapes, but indicating failure when converting
158cdf0e10cSrcweir         unmappable characters.
159cdf0e10cSrcweir 
160cdf0e10cSrcweir         @since UDK 3.2.7
161cdf0e10cSrcweir      */
162cdf0e10cSrcweir     rtl_UriEncodeStrictKeepEscapes,
163cdf0e10cSrcweir 
    /** Not an encode mechanism.

        NOTE(review): presumably present only to force the enum to a fixed
        size via SAL_MAX_ENUM -- confirm against sal/types.h.
     */
164cdf0e10cSrcweir     rtl_UriEncode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
165cdf0e10cSrcweir }
166cdf0e10cSrcweir rtl_UriEncodeMechanism;
167cdf0e10cSrcweir 
168cdf0e10cSrcweir /** The mechanism describing how rtl_uriDecode() translates (part of) a URI
169cdf0e10cSrcweir     into a Unicode string.

    The enumerators are implicitly numbered from 0 in declaration order, so
    a new mechanism inserted anywhere but directly before
    rtl_UriDecode_FORCE_EQUAL_SIZE renumbers the existing ones.
170cdf0e10cSrcweir  */
171cdf0e10cSrcweir typedef enum
172cdf0e10cSrcweir {
173cdf0e10cSrcweir     /** The text is returned completely unmodified.
174cdf0e10cSrcweir      */
175cdf0e10cSrcweir     rtl_UriDecodeNone,
176cdf0e10cSrcweir 
177cdf0e10cSrcweir     /** The text is returned in the form of an IURI (cf.
178cdf0e10cSrcweir         draft-masinter-url-i18n-05.txt).
179cdf0e10cSrcweir 
180cdf0e10cSrcweir         @descr
181cdf0e10cSrcweir         All escape sequences representing ASCII characters (%00--%7F) are
182cdf0e10cSrcweir         kept, all other escape sequences are interpreted as UTF-8 characters
183cdf0e10cSrcweir         and translated to Unicode, if possible.
184cdf0e10cSrcweir      */
185cdf0e10cSrcweir     rtl_UriDecodeToIuri,
186cdf0e10cSrcweir 
187cdf0e10cSrcweir     /** The text is decoded.
188cdf0e10cSrcweir 
189cdf0e10cSrcweir         @descr
190cdf0e10cSrcweir         All escape sequences representing characters from the given charset
191cdf0e10cSrcweir         are decoded and translated to Unicode, if possible.
192cdf0e10cSrcweir      */
193cdf0e10cSrcweir     rtl_UriDecodeWithCharset,
194cdf0e10cSrcweir 
195cdf0e10cSrcweir     /** Like rtl_UriDecodeWithCharset, but indicating failure when converting
196cdf0e10cSrcweir         unmappable characters.
197cdf0e10cSrcweir 
198cdf0e10cSrcweir         @since UDK 3.2.0
199cdf0e10cSrcweir      */
200cdf0e10cSrcweir     rtl_UriDecodeStrict,
201cdf0e10cSrcweir 
    /** Not a decode mechanism.

        NOTE(review): presumably present only to force the enum to a fixed
        size via SAL_MAX_ENUM -- confirm against sal/types.h.
     */
202cdf0e10cSrcweir     rtl_UriDecode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
203cdf0e10cSrcweir }
204cdf0e10cSrcweir rtl_UriDecodeMechanism;
205cdf0e10cSrcweir 
206cdf0e10cSrcweir /** Map a predefined rtl_UriCharClass to a form usable by rtl_uriEncode().
207cdf0e10cSrcweir 
208cdf0e10cSrcweir     @descr
209cdf0e10cSrcweir     The function rtl_uriEncode() expects an array of 128 booleans, and this
210cdf0e10cSrcweir     function maps rtl_UriCharClass enumeration members to such arrays.
211cdf0e10cSrcweir 
212cdf0e10cSrcweir     @param eCharClass
213cdf0e10cSrcweir     Any valid member of rtl_UriCharClass.
214cdf0e10cSrcweir 
215cdf0e10cSrcweir     @return
216cdf0e10cSrcweir     An array of 128 booleans, to be used in calls to rtl_uriEncode().  The
    array is returned through a pointer to const, so callers must not modify
    it.

    NOTE(review): the array is presumably indexed by ASCII character code
    and has static lifetime (nothing here says the caller must free it) --
    confirm against the implementation.
217cdf0e10cSrcweir  */
218cdf0e10cSrcweir sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass)
219cdf0e10cSrcweir     SAL_THROW_EXTERN_C();
220cdf0e10cSrcweir 
221cdf0e10cSrcweir /** Encode a text as (part of) a URI.
222cdf0e10cSrcweir 
223cdf0e10cSrcweir     @param pText
224cdf0e10cSrcweir     Any Unicode string.  Must not be null.
225cdf0e10cSrcweir 
226cdf0e10cSrcweir     @param pCharClass
227cdf0e10cSrcweir     A char class, represented as an array of 128 booleans (true means keep the
228cdf0e10cSrcweir     corresponding ASCII character unencoded, false means encode it).  Must not
229cdf0e10cSrcweir     be null, and the boolean corresponding to the percent sign (0x25) must be
230cdf0e10cSrcweir     false.  (See rtl_getUriCharClass() for a function mapping from
231cdf0e10cSrcweir     rtl_UriCharClass to such arrays.)
232cdf0e10cSrcweir 
233cdf0e10cSrcweir     @param eMechanism
234cdf0e10cSrcweir     The mechanism describing how escape sequences in the input text are
235cdf0e10cSrcweir     handled.
236cdf0e10cSrcweir 
237cdf0e10cSrcweir     @param eCharset
238cdf0e10cSrcweir     When Unicode characters from the input text have to be written using
239cdf0e10cSrcweir     escape sequences (because they are either outside the ASCII range or do
240cdf0e10cSrcweir     not belong to the given char class), they are first translated into this
241cdf0e10cSrcweir     charset before being encoded using escape sequences.
242cdf0e10cSrcweir 
243cdf0e10cSrcweir     Also, if the encode mechanism is rtl_UriEncodeCheckEscapes, all escape
244cdf0e10cSrcweir     sequences already present in the input text are interpreted as characters
245cdf0e10cSrcweir     from this charset.
246cdf0e10cSrcweir 
247cdf0e10cSrcweir     @param pResult
248cdf0e10cSrcweir     Returns an encoded representation of the input text.  Must itself not be
249cdf0e10cSrcweir     null, and must point to either null or a valid string.
250cdf0e10cSrcweir 
251cdf0e10cSrcweir     If the encode mechanism is rtl_UriEncodeStrict, and pText cannot be
252cdf0e10cSrcweir     converted to eCharset because it contains unmappable characters (which
253cdf0e10cSrcweir     implies that pText is not empty), then an empty string is returned.

    NOTE(review): rtl_UriEncodeStrictKeepEscapes is documented as likewise
    indicating failure on unmappable characters; presumably it yields the
    same empty-string result -- confirm against the implementation.
254cdf0e10cSrcweir  */
255cdf0e10cSrcweir void SAL_CALL rtl_uriEncode(rtl_uString * pText,
256cdf0e10cSrcweir                             sal_Bool const * pCharClass,
257cdf0e10cSrcweir                             rtl_UriEncodeMechanism eMechanism,
258cdf0e10cSrcweir                             rtl_TextEncoding eCharset,
259cdf0e10cSrcweir                             rtl_uString ** pResult)
260cdf0e10cSrcweir     SAL_THROW_EXTERN_C();
261cdf0e10cSrcweir 
262cdf0e10cSrcweir /** Decode (a part of) a URI.
263cdf0e10cSrcweir 
264cdf0e10cSrcweir     @param pText
265cdf0e10cSrcweir     Any Unicode string.  Must not be null.  (If the input is indeed part of a
266cdf0e10cSrcweir     valid URI, this string will only contain a subset of the ASCII characters,
267cdf0e10cSrcweir     but this function also handles other Unicode characters properly.)
268cdf0e10cSrcweir 
269cdf0e10cSrcweir     @param eMechanism
270cdf0e10cSrcweir     The mechanism describing how the input text is translated into a Unicode
271cdf0e10cSrcweir     string.
272cdf0e10cSrcweir 
273cdf0e10cSrcweir     @param eCharset
274cdf0e10cSrcweir     When the decode mechanism is rtl_UriDecodeWithCharset or
275cdf0e10cSrcweir     rtl_UriDecodeStrict, all escape sequences in the input text are
276cdf0e10cSrcweir     interpreted as characters from this charset.  Those characters are
277cdf0e10cSrcweir     translated to Unicode characters in the resulting output, if possible.
278cdf0e10cSrcweir 
279cdf0e10cSrcweir     When the decode mechanism is rtl_UriDecodeNone or rtl_UriDecodeToIuri,
280cdf0e10cSrcweir     this parameter is ignored (and is best specified as
281cdf0e10cSrcweir     RTL_TEXTENCODING_UTF8).
282cdf0e10cSrcweir 
283cdf0e10cSrcweir     @param pResult
284cdf0e10cSrcweir     Returns a decoded representation of the input text.  Must itself not be
285cdf0e10cSrcweir     null, and must point to either null or a valid string.
286cdf0e10cSrcweir 
287cdf0e10cSrcweir     If the decode mechanism is rtl_UriDecodeStrict, and pText cannot be
288cdf0e10cSrcweir     converted to eCharset because it contains (encodings of) unmappable
289cdf0e10cSrcweir     characters (which implies that pText is not empty), then an empty string is
290cdf0e10cSrcweir     returned.
291cdf0e10cSrcweir  */
292cdf0e10cSrcweir void SAL_CALL rtl_uriDecode(rtl_uString * pText,
293cdf0e10cSrcweir                             rtl_UriDecodeMechanism eMechanism,
294cdf0e10cSrcweir                             rtl_TextEncoding eCharset,
295cdf0e10cSrcweir                             rtl_uString ** pResult)
296cdf0e10cSrcweir     SAL_THROW_EXTERN_C();
297cdf0e10cSrcweir 
298cdf0e10cSrcweir /** Convert a relative URI reference into an absolute one.
299cdf0e10cSrcweir 
300cdf0e10cSrcweir     A URI reference is a URI plus an optional <"#" fragment> part.
301cdf0e10cSrcweir 
302cdf0e10cSrcweir     This function uses the algorithm described in RFC 2396, section 5.2, with
303cdf0e10cSrcweir     the following clarifications:  (1) Backwards-compatible relative URIs
304cdf0e10cSrcweir     starting with a scheme component (see RFC 2396, section 5.2, step 3) are not
305cdf0e10cSrcweir     supported.  (2) Segments "." and ".." within the path of the base URI are
306cdf0e10cSrcweir     not considered special; RFC 2396 seems a bit unclear about that point.
307cdf0e10cSrcweir     (3) Erroneous excess segments ".." within the path of the relative URI (if
308cdf0e10cSrcweir     it is indeed relative) are left intact, as the examples in RFC 2396,
309cdf0e10cSrcweir     section C.2, suggest.  (4) If the relative URI is a reference to the
310cdf0e10cSrcweir     "current document," the "current document" is taken to be the base URI.
311cdf0e10cSrcweir 
312cdf0e10cSrcweir     This function signals exceptions by returning false and letting pException
313cdf0e10cSrcweir     point to a message explaining the exception.
314cdf0e10cSrcweir 
315cdf0e10cSrcweir     @param pBaseUriRef
316cdf0e10cSrcweir     An absolute, hierarchical URI reference that serves as the base URI.  If it
317cdf0e10cSrcweir     has to be inspected (i.e., pRelUriRef is not an absolute URI already), and
318cdf0e10cSrcweir     if it either is not an absolute URI (i.e., does not begin with a
319cdf0e10cSrcweir     <scheme ":"> part) or has a path that is non-empty but does not start
320cdf0e10cSrcweir     with "/", an exception will be signalled.
321cdf0e10cSrcweir 
322cdf0e10cSrcweir     @param pRelUriRef
323cdf0e10cSrcweir     A URI reference that may be either absolute or relative.  If it is
324cdf0e10cSrcweir     absolute, it will be returned unmodified (and it need not be hierarchical
325cdf0e10cSrcweir     then).
326cdf0e10cSrcweir 
327cdf0e10cSrcweir     @param pResult
328cdf0e10cSrcweir     Returns an absolute URI reference.  Must itself not be null, and must point
329cdf0e10cSrcweir     to either null or a valid string.  If an exception is signalled, it is left
330cdf0e10cSrcweir     unchanged.
331cdf0e10cSrcweir 
332cdf0e10cSrcweir     @param pException
333cdf0e10cSrcweir     Returns an explanatory message in case an exception is signalled.  Must
334cdf0e10cSrcweir     itself not be null, and must point to either null or a valid string.  If no
335cdf0e10cSrcweir     exception is signalled, it is left unchanged.
336cdf0e10cSrcweir 
337cdf0e10cSrcweir     @return
338cdf0e10cSrcweir     True if no exception is signalled, otherwise false.
339cdf0e10cSrcweir  */
340cdf0e10cSrcweir sal_Bool SAL_CALL rtl_uriConvertRelToAbs(rtl_uString * pBaseUriRef,
341cdf0e10cSrcweir                                          rtl_uString * pRelUriRef,
342cdf0e10cSrcweir                                          rtl_uString ** pResult,
343cdf0e10cSrcweir                                          rtl_uString ** pException)
344cdf0e10cSrcweir     SAL_THROW_EXTERN_C();
345cdf0e10cSrcweir 
346cdf0e10cSrcweir #if defined __cplusplus
347cdf0e10cSrcweir }
348cdf0e10cSrcweir #endif /* __cplusplus */
349cdf0e10cSrcweir 
350cdf0e10cSrcweir #endif /* _RTL_URI_H_ */
351