xref: /aoo42x/main/sal/inc/rtl/uri.h (revision cdf0e10c)
1*cdf0e10cSrcweir /*************************************************************************
2*cdf0e10cSrcweir  *
3*cdf0e10cSrcweir  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4*cdf0e10cSrcweir  *
5*cdf0e10cSrcweir  * Copyright 2000, 2010 Oracle and/or its affiliates.
6*cdf0e10cSrcweir  *
7*cdf0e10cSrcweir  * OpenOffice.org - a multi-platform office productivity suite
8*cdf0e10cSrcweir  *
9*cdf0e10cSrcweir  * This file is part of OpenOffice.org.
10*cdf0e10cSrcweir  *
11*cdf0e10cSrcweir  * OpenOffice.org is free software: you can redistribute it and/or modify
12*cdf0e10cSrcweir  * it under the terms of the GNU Lesser General Public License version 3
13*cdf0e10cSrcweir  * only, as published by the Free Software Foundation.
14*cdf0e10cSrcweir  *
15*cdf0e10cSrcweir  * OpenOffice.org is distributed in the hope that it will be useful,
16*cdf0e10cSrcweir  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17*cdf0e10cSrcweir  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18*cdf0e10cSrcweir  * GNU Lesser General Public License version 3 for more details
19*cdf0e10cSrcweir  * (a copy is included in the LICENSE file that accompanied this code).
20*cdf0e10cSrcweir  *
21*cdf0e10cSrcweir  * You should have received a copy of the GNU Lesser General Public License
22*cdf0e10cSrcweir  * version 3 along with OpenOffice.org.  If not, see
23*cdf0e10cSrcweir  * <http://www.openoffice.org/license.html>
24*cdf0e10cSrcweir  * for a copy of the LGPLv3 License.
25*cdf0e10cSrcweir  *
26*cdf0e10cSrcweir  ************************************************************************/
27*cdf0e10cSrcweir 
28*cdf0e10cSrcweir #ifndef _RTL_URI_H_
29*cdf0e10cSrcweir #define _RTL_URI_H_
30*cdf0e10cSrcweir 
31*cdf0e10cSrcweir #include "rtl/textenc.h"
32*cdf0e10cSrcweir #include "rtl/ustring.h"
33*cdf0e10cSrcweir #include "sal/types.h"
34*cdf0e10cSrcweir 
35*cdf0e10cSrcweir #if defined __cplusplus
36*cdf0e10cSrcweir extern "C" {
37*cdf0e10cSrcweir #endif /* __cplusplus */
38*cdf0e10cSrcweir 
39*cdf0e10cSrcweir /**  Various predefined URI 'char classes.'
40*cdf0e10cSrcweir 
41*cdf0e10cSrcweir      @descr
42*cdf0e10cSrcweir      A 'char class' defines which (ASCII) characters can be written 'as they
43*cdf0e10cSrcweir      are' in a part of a URI, and which characters have to be written using
44*cdf0e10cSrcweir      escape sequences ('%' followed by two hex digits).  Characters outside
45*cdf0e10cSrcweir      the ASCII range are always written using escape sequences.
46*cdf0e10cSrcweir 
47*cdf0e10cSrcweir      @descr
48*cdf0e10cSrcweir      If other char classes turn out to be frequently used, they can be added
49*cdf0e10cSrcweir      to this enumeration; the function rtl_getUriCharClass(), which maps the
50*cdf0e10cSrcweir      members to the arrays used by rtl_uriEncode(), must then be adapted too.
51*cdf0e10cSrcweir  */
52*cdf0e10cSrcweir typedef enum
53*cdf0e10cSrcweir {
54*cdf0e10cSrcweir     /** The empty char class.
55*cdf0e10cSrcweir 
56*cdf0e10cSrcweir         @descr
57*cdf0e10cSrcweir         All characters are written using escape sequences.
58*cdf0e10cSrcweir      */
59*cdf0e10cSrcweir     rtl_UriCharClassNone,
60*cdf0e10cSrcweir 
61*cdf0e10cSrcweir     /** The RFC 2732 <uric> char class.
62*cdf0e10cSrcweir 
63*cdf0e10cSrcweir         @descr
64*cdf0e10cSrcweir         The 'valid' characters are !$&'()*+,-./:;=?@[]_~ plus digits and
65*cdf0e10cSrcweir         letters.
66*cdf0e10cSrcweir      */
67*cdf0e10cSrcweir     rtl_UriCharClassUric,
68*cdf0e10cSrcweir 
69*cdf0e10cSrcweir     /** The RFC 2396 <uric_no_slash> char class.
70*cdf0e10cSrcweir 
71*cdf0e10cSrcweir         @descr
72*cdf0e10cSrcweir         The 'valid' characters are !$&'()*+,-.:;=?@_~ plus digits and letters.
73*cdf0e10cSrcweir      */
74*cdf0e10cSrcweir     rtl_UriCharClassUricNoSlash,
75*cdf0e10cSrcweir 
76*cdf0e10cSrcweir     /** The RFC 2396 <rel_segment> char class.
77*cdf0e10cSrcweir 
78*cdf0e10cSrcweir         @descr
79*cdf0e10cSrcweir         The 'valid' characters are !$&'()*+,-.;=@_~ plus digits and letters.
80*cdf0e10cSrcweir      */
81*cdf0e10cSrcweir     rtl_UriCharClassRelSegment,
82*cdf0e10cSrcweir 
83*cdf0e10cSrcweir     /** The RFC 2396 <reg_name> char class.
84*cdf0e10cSrcweir 
85*cdf0e10cSrcweir         @descr
86*cdf0e10cSrcweir         The 'valid' characters are !$&'()*+,-.:;=@_~ plus digits and letters.
87*cdf0e10cSrcweir      */
88*cdf0e10cSrcweir     rtl_UriCharClassRegName,
89*cdf0e10cSrcweir 
90*cdf0e10cSrcweir     /** The RFC 2396 <userinfo> char class.
91*cdf0e10cSrcweir 
92*cdf0e10cSrcweir         @descr
93*cdf0e10cSrcweir         The 'valid' characters are !$&'()*+,-.:;=_~ plus digits and letters.
94*cdf0e10cSrcweir      */
95*cdf0e10cSrcweir     rtl_UriCharClassUserinfo,
96*cdf0e10cSrcweir 
97*cdf0e10cSrcweir     /** The RFC 2396 <pchar> char class.
98*cdf0e10cSrcweir 
99*cdf0e10cSrcweir         @descr
100*cdf0e10cSrcweir         The 'valid' characters are !$&'()*+,-.:=@_~ plus digits and letters.
101*cdf0e10cSrcweir      */
102*cdf0e10cSrcweir     rtl_UriCharClassPchar,
103*cdf0e10cSrcweir 
104*cdf0e10cSrcweir     /** The char class for the values of uno URL parameters.
105*cdf0e10cSrcweir 
106*cdf0e10cSrcweir         @descr
107*cdf0e10cSrcweir         The 'valid' characters are !$&'()*+-./:?@_~ plus digits and letters.
108*cdf0e10cSrcweir      */
109*cdf0e10cSrcweir     rtl_UriCharClassUnoParamValue,
110*cdf0e10cSrcweir     /* Presumably not a real char class, only pins the enum's size (confirm). */
111*cdf0e10cSrcweir     rtl_UriCharClass_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
112*cdf0e10cSrcweir }
113*cdf0e10cSrcweir rtl_UriCharClass;
114*cdf0e10cSrcweir 
115*cdf0e10cSrcweir /** The mechanism describing how escape sequences in the input of
116*cdf0e10cSrcweir     rtl_uriEncode() are handled.
117*cdf0e10cSrcweir  */
118*cdf0e10cSrcweir typedef enum
119*cdf0e10cSrcweir {
120*cdf0e10cSrcweir     /** The special meaning of '%' is ignored (i.e., there are by definition
121*cdf0e10cSrcweir         no escape sequences in the input).
122*cdf0e10cSrcweir 
123*cdf0e10cSrcweir         @descr
124*cdf0e10cSrcweir         This mechanism is useful to encode user input as part of a URI (e.g.,
125*cdf0e10cSrcweir         the user-supplied password in an ftp URL---'%20abcde' is a valid
126*cdf0e10cSrcweir         password, so do not assume that the '%20' is an escaped space).
127*cdf0e10cSrcweir      */
128*cdf0e10cSrcweir     rtl_UriEncodeIgnoreEscapes,
129*cdf0e10cSrcweir 
130*cdf0e10cSrcweir     /** All escape sequences ('%' followed by two hex digits) are kept intact,
131*cdf0e10cSrcweir         even if they represent characters that need not be escaped or if they
132*cdf0e10cSrcweir         do not even map to characters in the given charset.
133*cdf0e10cSrcweir 
134*cdf0e10cSrcweir         @descr
135*cdf0e10cSrcweir         This mechanism is useful when passing on complete URIs more or less
136*cdf0e10cSrcweir         unmodified (e.g., within an HTTP proxy): missing escape sequences are
137*cdf0e10cSrcweir         added, but existing escape sequences are not touched (except that any
138*cdf0e10cSrcweir         lower case hex digits are replaced by upper case hex digits).
139*cdf0e10cSrcweir      */
140*cdf0e10cSrcweir     rtl_UriEncodeKeepEscapes,
141*cdf0e10cSrcweir 
142*cdf0e10cSrcweir     /** All escape sequences ('%' followed by two hex digits) are resolved in
143*cdf0e10cSrcweir         a first step; only those that represent characters that need to be
144*cdf0e10cSrcweir         escaped are kept intact.
145*cdf0e10cSrcweir 
146*cdf0e10cSrcweir         @descr
147*cdf0e10cSrcweir         This mechanism is useful to properly encode complete URIs entered by
148*cdf0e10cSrcweir         the user: the URI is brought into a 'canonic form,' but care is taken
149*cdf0e10cSrcweir         not to damage (valid) escape sequences the (careful) user already
150*cdf0e10cSrcweir         entered as such.
151*cdf0e10cSrcweir      */
152*cdf0e10cSrcweir     rtl_UriEncodeCheckEscapes,
153*cdf0e10cSrcweir 
154*cdf0e10cSrcweir     /** Like rtl_UriEncodeIgnoreEscapes, but indicating failure (rtl_uriEncode()
155*cdf0e10cSrcweir         returns an empty string) when converting unmappable characters.
156*cdf0e10cSrcweir 
157*cdf0e10cSrcweir         @since UDK 3.2.0
158*cdf0e10cSrcweir      */
159*cdf0e10cSrcweir     rtl_UriEncodeStrict,
160*cdf0e10cSrcweir 
161*cdf0e10cSrcweir     /** Like rtl_UriEncodeKeepEscapes, but indicating failure when converting
162*cdf0e10cSrcweir         unmappable characters (presumably signalled as for rtl_UriEncodeStrict).
163*cdf0e10cSrcweir 
164*cdf0e10cSrcweir         @since UDK 3.2.7
165*cdf0e10cSrcweir      */
166*cdf0e10cSrcweir     rtl_UriEncodeStrictKeepEscapes,
167*cdf0e10cSrcweir     /* Presumably not a real mechanism, only pins the enum's size (confirm). */
168*cdf0e10cSrcweir     rtl_UriEncode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
169*cdf0e10cSrcweir }
170*cdf0e10cSrcweir rtl_UriEncodeMechanism;
171*cdf0e10cSrcweir 
172*cdf0e10cSrcweir /** The mechanism describing how rtl_uriDecode() translates (part of) a URI
173*cdf0e10cSrcweir     into a Unicode string.
174*cdf0e10cSrcweir  */
175*cdf0e10cSrcweir typedef enum
176*cdf0e10cSrcweir {
177*cdf0e10cSrcweir     /** The text is returned completely unmodified.
178*cdf0e10cSrcweir      */
179*cdf0e10cSrcweir     rtl_UriDecodeNone,
180*cdf0e10cSrcweir 
181*cdf0e10cSrcweir     /** The text is returned in the form of an IURI (cf.
182*cdf0e10cSrcweir         draft-masinter-url-i18n-05.txt).
183*cdf0e10cSrcweir 
184*cdf0e10cSrcweir         @descr
185*cdf0e10cSrcweir         All escape sequences representing ASCII characters (%00--%7F) are
186*cdf0e10cSrcweir         kept, all other escape sequences are interpreted as UTF-8 characters
187*cdf0e10cSrcweir         and translated to Unicode, if possible.
188*cdf0e10cSrcweir      */
189*cdf0e10cSrcweir     rtl_UriDecodeToIuri,
190*cdf0e10cSrcweir 
191*cdf0e10cSrcweir     /** The text is decoded.
192*cdf0e10cSrcweir 
193*cdf0e10cSrcweir         @descr
194*cdf0e10cSrcweir         All escape sequences representing characters from the given charset
195*cdf0e10cSrcweir         are decoded and translated to Unicode, if possible.
196*cdf0e10cSrcweir      */
197*cdf0e10cSrcweir     rtl_UriDecodeWithCharset,
198*cdf0e10cSrcweir 
199*cdf0e10cSrcweir     /** Like rtl_UriDecodeWithCharset, but indicating failure (rtl_uriDecode()
200*cdf0e10cSrcweir         returns an empty string) when converting unmappable characters.
201*cdf0e10cSrcweir 
202*cdf0e10cSrcweir         @since UDK 3.2.0
203*cdf0e10cSrcweir      */
204*cdf0e10cSrcweir     rtl_UriDecodeStrict,
205*cdf0e10cSrcweir     /* Presumably not a real mechanism, only pins the enum's size (confirm). */
206*cdf0e10cSrcweir     rtl_UriDecode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM
207*cdf0e10cSrcweir }
208*cdf0e10cSrcweir rtl_UriDecodeMechanism;
209*cdf0e10cSrcweir 
210*cdf0e10cSrcweir /** Map a predefined rtl_UriCharClass to a form usable by rtl_uriEncode().
211*cdf0e10cSrcweir 
212*cdf0e10cSrcweir     @descr
213*cdf0e10cSrcweir     The function rtl_uriEncode() expects an array of 128 booleans, and this
214*cdf0e10cSrcweir     function maps rtl_UriCharClass enumeration members to such arrays.
215*cdf0e10cSrcweir 
216*cdf0e10cSrcweir     @param eCharClass
217*cdf0e10cSrcweir     Any valid member of rtl_UriCharClass.
218*cdf0e10cSrcweir 
219*cdf0e10cSrcweir     @return
220*cdf0e10cSrcweir     A 128-entry boolean array (indexed by ASCII code) for rtl_uriEncode().
221*cdf0e10cSrcweir  */
222*cdf0e10cSrcweir sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass)
223*cdf0e10cSrcweir     SAL_THROW_EXTERN_C();
224*cdf0e10cSrcweir 
225*cdf0e10cSrcweir /** Encode a text as (part of) a URI.
226*cdf0e10cSrcweir 
227*cdf0e10cSrcweir     @param pText
228*cdf0e10cSrcweir     Any Unicode string.  Must not be null.
229*cdf0e10cSrcweir 
230*cdf0e10cSrcweir     @param pCharClass
231*cdf0e10cSrcweir     A char class, represented as an array of 128 booleans (true means keep the
232*cdf0e10cSrcweir     corresponding ASCII character unencoded, false means encode it).  Must not
233*cdf0e10cSrcweir     be null, and the boolean corresponding to the percent sign (0x25) must be
234*cdf0e10cSrcweir     false.  (See rtl_getUriCharClass() for a function mapping from
235*cdf0e10cSrcweir     rtl_UriCharClass to such arrays.)
236*cdf0e10cSrcweir 
237*cdf0e10cSrcweir     @param eMechanism
238*cdf0e10cSrcweir     The mechanism describing how escape sequences in the input text are
239*cdf0e10cSrcweir     handled.
240*cdf0e10cSrcweir 
241*cdf0e10cSrcweir     @param eCharset
242*cdf0e10cSrcweir     When Unicode characters from the input text have to be written using
243*cdf0e10cSrcweir     escape sequences (because they are either outside the ASCII range or do
244*cdf0e10cSrcweir     not belong to the given char class), they are first translated into this
245*cdf0e10cSrcweir     charset before being encoded using escape sequences.
246*cdf0e10cSrcweir 
247*cdf0e10cSrcweir     Also, if the encode mechanism is rtl_UriEncodeCheckEscapes, all escape
248*cdf0e10cSrcweir     sequences already present in the input text are interpreted as characters
249*cdf0e10cSrcweir     from this charset.
250*cdf0e10cSrcweir 
251*cdf0e10cSrcweir     @param pResult
252*cdf0e10cSrcweir     Returns an encoded representation of the input text.  Must itself not be
253*cdf0e10cSrcweir     null, and must point to either null or a valid string.
254*cdf0e10cSrcweir 
255*cdf0e10cSrcweir     If the encode mechanism is rtl_UriEncodeStrict (presumably likewise for
256*cdf0e10cSrcweir     rtl_UriEncodeStrictKeepEscapes; confirm) and pText contains characters
257*cdf0e10cSrcweir     unmappable to eCharset (so pText is not empty), an empty string results.
258*cdf0e10cSrcweir  */
259*cdf0e10cSrcweir void SAL_CALL rtl_uriEncode(rtl_uString * pText,
260*cdf0e10cSrcweir                             sal_Bool const * pCharClass,
261*cdf0e10cSrcweir                             rtl_UriEncodeMechanism eMechanism,
262*cdf0e10cSrcweir                             rtl_TextEncoding eCharset,
263*cdf0e10cSrcweir                             rtl_uString ** pResult)
264*cdf0e10cSrcweir     SAL_THROW_EXTERN_C();
265*cdf0e10cSrcweir 
266*cdf0e10cSrcweir /** Decode (a part of) a URI.
267*cdf0e10cSrcweir 
268*cdf0e10cSrcweir     @param pText
269*cdf0e10cSrcweir     Any Unicode string.  Must not be null.  (If the input is indeed part of a
270*cdf0e10cSrcweir     valid URI, this string will only contain a subset of the ASCII characters,
271*cdf0e10cSrcweir     but this function also handles other Unicode characters properly.)
272*cdf0e10cSrcweir 
273*cdf0e10cSrcweir     @param eMechanism
274*cdf0e10cSrcweir     The mechanism describing how the input text is translated into a Unicode
275*cdf0e10cSrcweir     string.
276*cdf0e10cSrcweir 
277*cdf0e10cSrcweir     @param eCharset
278*cdf0e10cSrcweir     When the decode mechanism is rtl_UriDecodeWithCharset or
279*cdf0e10cSrcweir     rtl_UriDecodeStrict, all escape sequences in the input text are
280*cdf0e10cSrcweir     interpreted as characters from this charset.  Those characters are
281*cdf0e10cSrcweir     translated to Unicode characters in the resulting output, if possible.
282*cdf0e10cSrcweir 
283*cdf0e10cSrcweir     When the decode mechanism is rtl_UriDecodeNone or rtl_UriDecodeToIuri,
284*cdf0e10cSrcweir     this parameter is ignored (and is best specified as
285*cdf0e10cSrcweir     RTL_TEXTENCODING_UTF8).
286*cdf0e10cSrcweir 
287*cdf0e10cSrcweir     @param pResult
288*cdf0e10cSrcweir     Returns a decoded representation of the input text.  Must itself not be
289*cdf0e10cSrcweir     null, and must point to either null or a valid string.
290*cdf0e10cSrcweir 
291*cdf0e10cSrcweir     If the decode mechanism is rtl_UriDecodeStrict, and pText cannot be
292*cdf0e10cSrcweir     converted to eCharset because it contains (encodings of) unmappable
293*cdf0e10cSrcweir     characters (which implies that pText is not empty), then an empty string is
294*cdf0e10cSrcweir     returned.
295*cdf0e10cSrcweir  */
296*cdf0e10cSrcweir void SAL_CALL rtl_uriDecode(rtl_uString * pText,
297*cdf0e10cSrcweir                             rtl_UriDecodeMechanism eMechanism,
298*cdf0e10cSrcweir                             rtl_TextEncoding eCharset,
299*cdf0e10cSrcweir                             rtl_uString ** pResult)
300*cdf0e10cSrcweir     SAL_THROW_EXTERN_C();
301*cdf0e10cSrcweir 
302*cdf0e10cSrcweir /** Convert a relative URI reference into an absolute one.
303*cdf0e10cSrcweir 
304*cdf0e10cSrcweir     A URI reference is a URI plus an optional <"#" fragment> part.
305*cdf0e10cSrcweir 
306*cdf0e10cSrcweir     This function uses the algorithm described in RFC 2396, section 5.2, with
307*cdf0e10cSrcweir     the following clarifications:  (1) Backwards-compatible relative URIs
308*cdf0e10cSrcweir     starting with a scheme component (see RFC 2396, section 5.2, step 3) are not
309*cdf0e10cSrcweir     supported.  (2) Segments "." and ".." within the path of the base URI are
310*cdf0e10cSrcweir     not considered special (RFC 2396 seems a bit unclear about that point).
311*cdf0e10cSrcweir     (3) Erroneous excess segments ".." within the path of the relative URI (if
312*cdf0e10cSrcweir     it is indeed relative) are left intact, as the examples in RFC 2396,
313*cdf0e10cSrcweir     section C.2, suggest.  (4) If the relative URI is a reference to the
314*cdf0e10cSrcweir     "current document," the "current document" is taken to be the base URI.
315*cdf0e10cSrcweir 
316*cdf0e10cSrcweir     This function signals exceptions by returning false and letting pException
317*cdf0e10cSrcweir     point to a message explaining the exception.
318*cdf0e10cSrcweir 
319*cdf0e10cSrcweir     @param pBaseUriRef
320*cdf0e10cSrcweir     An absolute, hierarchical URI reference that serves as the base URI.  If it
321*cdf0e10cSrcweir     has to be inspected (i.e., pRelUriRef is not an absolute URI already), and
322*cdf0e10cSrcweir     if it either is not an absolute URI (i.e., does not begin with a
323*cdf0e10cSrcweir     <scheme ":"> part) or has a path that is non-empty but does not start
324*cdf0e10cSrcweir     with "/", an exception will be signalled.
325*cdf0e10cSrcweir 
326*cdf0e10cSrcweir     @param pRelUriRef
327*cdf0e10cSrcweir     A URI reference that may be either absolute or relative.  If it is
328*cdf0e10cSrcweir     absolute, it will be returned unmodified (and it need not be hierarchical
329*cdf0e10cSrcweir     then).
330*cdf0e10cSrcweir 
331*cdf0e10cSrcweir     @param pResult
332*cdf0e10cSrcweir     Returns an absolute URI reference.  Must itself not be null, and must point
333*cdf0e10cSrcweir     to either null or a valid string.  If an exception is signalled, it is left
334*cdf0e10cSrcweir     unchanged.
335*cdf0e10cSrcweir 
336*cdf0e10cSrcweir     @param pException
337*cdf0e10cSrcweir     Returns an explanatory message in case an exception is signalled.  Must
338*cdf0e10cSrcweir     itself not be null, and must point to either null or a valid string.  If no
339*cdf0e10cSrcweir     exception is signalled, it is left unchanged.
340*cdf0e10cSrcweir 
341*cdf0e10cSrcweir     @return
342*cdf0e10cSrcweir     True if no exception is signalled, otherwise false.
343*cdf0e10cSrcweir  */
344*cdf0e10cSrcweir sal_Bool SAL_CALL rtl_uriConvertRelToAbs(rtl_uString * pBaseUriRef,
345*cdf0e10cSrcweir                                          rtl_uString * pRelUriRef,
346*cdf0e10cSrcweir                                          rtl_uString ** pResult,
347*cdf0e10cSrcweir                                          rtl_uString ** pException)
348*cdf0e10cSrcweir     SAL_THROW_EXTERN_C();
349*cdf0e10cSrcweir 
350*cdf0e10cSrcweir #if defined __cplusplus
351*cdf0e10cSrcweir }
352*cdf0e10cSrcweir #endif /* __cplusplus */
353*cdf0e10cSrcweir 
354*cdf0e10cSrcweir #endif /* _RTL_URI_H_ */
355