xref: /aoo42x/main/l10ntools/source/wtratree.cxx (revision cdf0e10c)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_l10ntools.hxx"
30 
31 
32 #include "wtratree.hxx"
33 
34 
35 
/** @ATTENTION
	For reasons of speed, class WordTransTree works with two plain
	char arrays, sOutput and sInput, instead of safe containers or
	streams. So be extremely careful when changing this code!
**/
41 
42 
43 
44 // NOT FULLY DECLARED SERVICES
45 #include <string.h>
46 #include <stdio.h>
47 #include <ctype.h>
48 #include "wtranode.hxx"
49 
50 
51 const BRANCH_T	BR_END			= 0;
52 const BRANCH_T	BR_NONALPHA     = 1;
53 const BRANCH_T	BR_HOTKEY       = 2;
54 const BRANCH_T	BR_BACKSLASH    = 3;
55 const BRANCH_T	BR_ALPHABASE    = 4;   	/// @ATTENTION  All branches not valid for words must be smaller than this value!
56 const BRANCH_T	BR_AE           = 30;
57 const BRANCH_T	BR_OE           = 31;
58 const BRANCH_T	BR_UE           = 32;
59 const BRANCH_T	BR_SZ           = 33;
60 const BRANCH_T	BR_MAX          = 34;	/// @ATTENTION  Must be updated always!
61 
62 const BRANCH_T	BR_START 		= 0;
63 
64 
65 
66 
67 
68 WordTransTree::WordTransTree(CharSet  i_nWorkingCharSet)
69 	:	sInput(0),
70 		nInputLength(0),
71 		pInputEnd(0),
72 		sOutput(0),
73 		nOutputMaxLength(0),
74 		dpParsingTreeTop(0),
75 		pUnknownAlpha(0),
76 		// cChar2Branch
77         c_AE(u_char('\xC4')), c_OE(u_char('\xD6')), c_UE(u_char('\xDC')),
78         c_ae(u_char('\xE4')), c_oe(u_char('\xF6')), c_ue(u_char('\xFC')),
79 		pInputCurTokenStart(0),
80 		pInputPosition(0),
81 		pOutputPosition(0),
82 		pCurParseNode(0),
83 		eCurResult(OK),
84 		cCurHotkey(0),
85 		cCurHotkeySign(u_char('~'))
86 {
87 	// Initialize parsing tree:
88 	pUnknownAlpha = new WTT_Node(BR_ALPHABASE,0,0);	// This will be deleted as part of the parsing tree.
89 	for ( UINT8 i = BR_ALPHABASE; i < C_NR_OF_BRANCHES; i++)
90 	{
91 		pUnknownAlpha->SetBranch(i,pUnknownAlpha);
92 	}  // end for
93 
94 	dpParsingTreeTop = new WTT_Node(BR_START,0,pUnknownAlpha);
95 
96 	WTT_Node * dpNonAlpha = new WTT_Node(BR_NONALPHA,0,0);
97 
98 	dpNonAlpha->SetBranch(BR_NONALPHA,dpNonAlpha);
99 	dpParsingTreeTop->SetBranch(BR_NONALPHA,dpNonAlpha);
100 
101 	WTT_Node * dpBackslash = new WTT_Node(BR_BACKSLASH,dpNonAlpha,dpNonAlpha);
102 	dpBackslash->SetBranch(BR_END,0);
103 
104 	dpParsingTreeTop->SetBranch(BR_BACKSLASH,dpBackslash);
105 	dpNonAlpha->SetBranch(BR_BACKSLASH,dpBackslash);
106 
107 
108 	// Initialize character set:
109 	SetCharSet(i_nWorkingCharSet);
110 
111 	if (C_BR_ALPHABASE != BR_ALPHABASE || C_NR_OF_BRANCHES != BR_MAX)
112 	{
113 		fprintf(stderr, "Assertion failed: file %s line %d.", __FILE__,  __LINE__);
114 		exit(1);
115 	}
116 }
117 
118 void
119 WordTransTree::SetCharSet(CharSet i_nWorkingCharSet)
120 {
121     ByteString sConvert("\xC4\xD6\xDC\xE4\xF6\xFC\xDF");
122 	const u_char * pConvert = (const u_char * ) ( sConvert.Convert(RTL_TEXTENCODING_MS_1252, i_nWorkingCharSet).GetBuffer() );
123 
124 	INT16 i = 0;
125 	for ( ; i < C_NR_OF_POSSIBLE_CHARS; ++i )
126 	{
127 		cChar2Branch[i] = BR_NONALPHA;
128 	}  // end for
129 	for ( i = 'a'; i <= 'z'; ++i )
130 	{
131 		cChar2Branch[i] = BR_ALPHABASE + i - 'a';
132 	}  // end for
133 	for ( i = 'A'; i <= 'Z'; ++i )
134 	{
135 		cChar2Branch[i] = BR_ALPHABASE + i - 'A';
136 	}  // end for
137 	cChar2Branch[pConvert[0]] = BR_AE;
138 	cChar2Branch[pConvert[1]] = BR_OE;
139 	cChar2Branch[pConvert[2]] = BR_UE;
140 	cChar2Branch[pConvert[3]] = BR_AE;
141 	cChar2Branch[pConvert[4]] = BR_OE;
142 	cChar2Branch[pConvert[5]] = BR_UE;
143 	cChar2Branch[pConvert[6]] = BR_SZ;
144 
145 	cChar2Branch[u_char('~')] = BR_HOTKEY;
146 	cChar2Branch[u_char('&')] = BR_HOTKEY;
147 
148 
149 	c_AE = pConvert[0];
150 	c_OE = pConvert[1];
151 	c_UE = pConvert[2];
152 	c_ae = pConvert[3];
153 	c_oe = pConvert[4];
154 	c_ue = pConvert[5];
155 }
156 
157 WordTransTree::~WordTransTree()
158 {
159 	delete dpParsingTreeTop;
160 	if (sOutput != 0)
161 		delete [] sOutput;
162 }
163 
164 void
165 WordTransTree::AddWordPair(	const ByteString &		i_sOldString,
166 							const ByteString &		i_sReplaceString )
167 {
168 	if (i_sOldString.Len() == 0)
169 		return;
170 
171 	pCurParseNode = dpParsingTreeTop;
172 	WTT_Node * pBranch = 0;
173 	char cBranch = 0;
174 
175 	for ( constr pOld = i_sOldString.GetBuffer();
176 		  *pOld != 0;
177 		  pOld++ )
178 	{
179 		cBranch = CalculateBranch(*pOld);
180 		pBranch = pCurParseNode->GetNextNode(cBranch);
181 		if (pBranch == 0 || pBranch == pUnknownAlpha)
182 		{
183 			pBranch = new WTT_Node(cBranch,0,pUnknownAlpha);
184 			pCurParseNode->SetBranch(cBranch,pBranch);
185 		}
186 		pCurParseNode = pBranch;
187 	}	// end for
188 	pCurParseNode->SetAsTokenToReplace(i_sReplaceString);
189 }
190 
191 void
192 WordTransTree::InitTransformation( const char *	i_sInput,
193 								   UINT32		i_nInputLength,
194 								   UINT32		i_nOutputMaxLength )
195 {
196 	sInput = (const u_char *)i_sInput;
197 	nInputLength = i_nInputLength;
198 	pInputEnd = &sInput[i_nInputLength];
199 
200 	pInputCurTokenStart = sInput;
201 	pInputPosition = sInput;
202 
203 	if (nOutputMaxLength < i_nOutputMaxLength)
204 	{
205 		if (sOutput != 0)
206 			delete [] sOutput;
207 		sOutput = new unsigned char[i_nOutputMaxLength];
208 		nOutputMaxLength = i_nOutputMaxLength;
209 	}
210 	pOutputPosition = sOutput;
211 }
212 
213 /**	pInputCurTokenStart and CurParseNode are updated just when
214 	starting this function. After its end they must not be changed
215 	till this functon is called again.
216 	Outside this function pInputPositon and pOutputPosition are both
217 	on the first not transformed char in their respective array.
218 **/
219 WordTransTree::E_Result
220 WordTransTree::TransformNextToken()
221 {
222 	pInputCurTokenStart = pInputPosition;
223 	pCurParseNode = dpParsingTreeTop;
224 	cCurHotkey = 0;
225     eCurResult = OK;
226 
227 	WTT_Node * pBranch = 0;
228 	UINT8 cBranch = 0;
229 
230 	for ( pCurParseNode = dpParsingTreeTop;
231 		  pInputPosition != pInputEnd;
232 		  ++pInputPosition )
233 	{
234 		cBranch = CalculateBranch(*pInputPosition);
235 		pBranch = pCurParseNode->GetNextNode( cBranch );
236 		if (pBranch != 0)
237 		{
238 			pCurParseNode = pBranch;
239 		}
240 		else
241 		{
242 			if (cBranch == BR_HOTKEY)   // current letter is '~' or '&'.
243 			{
244 				// Logic of the following. There are 9 possible cases -
245 				// A = alphabetic letter, NA = non alphabetic, TB = token begin,
246 				// Eot = end of text:
247 				//	 1.	A~A          set hotkey to following letter, continue
248 				//	 2.	A~NA         token end
249 				//	 3.	A~Eot        token end
250 				//	 4.	NA~A         token end
251 				//	 5.	NA~NA        continue
252 				//	 6.	A~Eof        continue
253 				//	 7.	TB~A         set hotkey to following letter, continue
254 				//	 8.	TB~NA        continue
255 				//	 9.	TB~Eot       continue
256 
257 				// bNext and Prev are true, if there are alphabetic letters:
258 				sal_Bool bNext =  pInputPosition + 1 != pInputEnd
259 									?   CalculateBranch(pInputPosition[1]) >= BR_ALPHABASE
260 									: 	sal_False;
261 				sal_Bool bPrev = pCurParseNode->Value() >= BR_ALPHABASE;
262 
263 				if ( bNext && (bPrev || pCurParseNode == dpParsingTreeTop) )
264 				{   // case 1. and 7.
265 					Handle_Hotkey();
266 					continue;
267 				}
268 				else if  (!bPrev && !bNext)
269 				{   // case 5.,6.,8.,9.
270 					continue;
271 				}
272 
273 				// Case 2.,3.,4. :
274 				// 	so this should be handled as an end of a token.
275 			}
276 			if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
277 			{
278 				Handle_TokenToKeep();
279 				return eCurResult;
280 			}
281 			else
282 			{
283 				Handle_TokenToTransform();
284 				return eCurResult;
285 			}	// endif (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
286 		} 	// endif (pBranch == 0) else
287 	}	// end for
288 
289 	// If here, the text end is reached
290 	if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
291 	{
292 		Handle_TokenToKeep();
293 		return eCurResult;
294 	}
295 	else
296 	{
297 		Handle_TokenToTransform();
298 		return eCurResult;
299 	}
300 }
301 
302 ByteString
303 WordTransTree::CurReplacingString() const
304 {
305 	return pCurParseNode->ReplaceString();
306 }
307 
308 void
309 WordTransTree::Handle_Hotkey()
310 {
311 	if (cCurHotkey == 0) 	// Avoid to replace the first found hotkey by
312 	                        //   a later one - though this shouldn't happen anyway.
313 	{
314 		cCurHotkey = (pInputPosition+1) != pInputEnd ? pInputPosition[1] : 0;
315 		cCurHotkeySign = *pInputPosition;
316 	}
317 }
318 
319 void
320 WordTransTree::Handle_TokenToKeep()
321 {
322 	UINT32 nTokenLength = pInputPosition-pInputCurTokenStart;
323 
324 	memcpy(pOutputPosition,pInputCurTokenStart,nTokenLength);
325 
326 	pOutputPosition += nTokenLength;
327 	*pOutputPosition = '\0';
328 }
329 
330 void
331 WordTransTree::Handle_TokenToTransform()
332 {
333 	sal_Bool bHaveHotkey = CalculateBranch(cCurHotkey) >= BR_ALPHABASE;
334 	const ByteString & rReplace = pCurParseNode->ReplaceString();
335 
336 	// Find position of hotkey in replace-string:
337 	sal_uInt16 nHotkeyPos = bHaveHotkey
338 							?	rReplace.Search(char(cCurHotkey))
339 							:	STRING_NOTFOUND;
340 	if (nHotkeyPos == STRING_NOTFOUND && bHaveHotkey)
341 	{
342 		if (cCurHotkey < 128)
343 		{
344 			if (islower(cCurHotkey))
345 				nHotkeyPos = rReplace.Search(toupper(char(cCurHotkey)));
346 			else
347 				nHotkeyPos = rReplace.Search(tolower(char(cCurHotkey)));
348 		}
349 		else	// cCurHotkey >= 128
350 		{
351 			if (cCurHotkey == c_ae)
352 				nHotkeyPos = rReplace.Search(char(c_AE));
353 			else if (cCurHotkey == c_oe)
354 				nHotkeyPos = rReplace.Search(char(c_OE));
355 			else if (cCurHotkey == c_ue)
356 				nHotkeyPos = rReplace.Search(char(c_UE));
357 			else if (cCurHotkey == c_AE)
358 				nHotkeyPos = rReplace.Search(char(c_ae));
359 			else if (cCurHotkey == c_OE)
360 				nHotkeyPos = rReplace.Search(char(c_oe));
361 			else if (cCurHotkey == c_UE)
362 				nHotkeyPos = rReplace.Search(char(c_ue));
363 		}	// endif (cCurHotkey < 128) else
364 
365 		if (nHotkeyPos == STRING_NOTFOUND)
366 		{
367 			eCurResult = HOTKEY_LOST;
368 			bHaveHotkey = sal_False;
369 		}
370 	} 	// endif (nHotkeyPos == STRING_NOT_FOUND && bHaveHotkey)
371 
372 
373 	UINT32 nOutputTokenLength = rReplace.Len() + (bHaveHotkey ? 1 : 0);
374 
375 	if (bHaveHotkey)
376 	{
377 		memcpy( pOutputPosition,
378 				pCurParseNode->ReplaceString().GetBuffer(),
379 				nHotkeyPos );
380 		*(pOutputPosition + nHotkeyPos) = cCurHotkeySign;
381 		memcpy( pOutputPosition + nHotkeyPos + 1,
382 				pCurParseNode->ReplaceString().GetBuffer() + nHotkeyPos,
383 				nOutputTokenLength - nHotkeyPos - 1);
384 	}
385 	else
386 	{
387 		memcpy( pOutputPosition,
388 				pCurParseNode->ReplaceString().GetBuffer(),
389 				nOutputTokenLength );
390 	}
391 
392 	// Convert first letter into upper if necessary:
393 	u_char cInStart = CalculateBranch(*pInputCurTokenStart) == BR_HOTKEY
394 							? 	pInputCurTokenStart[1]
395 							:	pInputCurTokenStart[0] ;
396 	u_char * pOutStart = nHotkeyPos == 0
397 							? 	pOutputPosition + 1
398 							:	pOutputPosition ;
399 	if (isupper(cInStart) || cInStart > 127)
400 	{   // Possibly cInStart is upper character:
401 		if (isupper(cInStart) || cInStart == c_AE || cInStart == c_OE || cInStart == c_UE)
402 		{	// Surely cInStart is upper character:
403 			u_char cOutStart = *pOutStart;
404 			if (cOutStart < 128)
405 				*pOutStart = toupper(cOutStart);
406 			else if (cOutStart == c_ae)
407 				*pOutStart = c_AE;
408 			else if (cOutStart == c_oe)
409 				*pOutStart = c_OE;
410 			else if (cOutStart == c_ue)
411 				*pOutStart = c_UE;
412 		}
413 	}  	// endif (isupper(cInStart) || cInStart > 127)
414 
415 	pOutputPosition += nOutputTokenLength;
416 	*pOutputPosition = '\0';
417 }
418 
419