1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_l10ntools.hxx"
26
27
28 #include "wtratree.hxx"
29
30
31
32 /** @ATTENTION
33 For reasons of speed, class WordTransTree works with two simple
34 char arrays, sOutput and sInput, instead of secure containers or
35 streams. So be extremely careful, when changing this code!!!
36 **/
37
38
39
40 // NOT FULLY DECLARED SERVICES
41 #include <string.h>
42 #include <stdio.h>
43 #include <ctype.h>
44 #include "wtranode.hxx"
45
46
47 const BRANCH_T BR_END = 0;
48 const BRANCH_T BR_NONALPHA = 1;
49 const BRANCH_T BR_HOTKEY = 2;
50 const BRANCH_T BR_BACKSLASH = 3;
51 const BRANCH_T BR_ALPHABASE = 4; /// @ATTENTION All branches not valid for words must be smaller than this value!
52 const BRANCH_T BR_AE = 30;
53 const BRANCH_T BR_OE = 31;
54 const BRANCH_T BR_UE = 32;
55 const BRANCH_T BR_SZ = 33;
56 const BRANCH_T BR_MAX = 34; /// @ATTENTION Must be updated always!
57
58 const BRANCH_T BR_START = 0;
59
60
61
62
63
WordTransTree(CharSet i_nWorkingCharSet)64 WordTransTree::WordTransTree(CharSet i_nWorkingCharSet)
65 : sInput(0),
66 nInputLength(0),
67 pInputEnd(0),
68 sOutput(0),
69 nOutputMaxLength(0),
70 dpParsingTreeTop(0),
71 pUnknownAlpha(0),
72 // cChar2Branch
73 c_AE(u_char('\xC4')), c_OE(u_char('\xD6')), c_UE(u_char('\xDC')),
74 c_ae(u_char('\xE4')), c_oe(u_char('\xF6')), c_ue(u_char('\xFC')),
75 pInputCurTokenStart(0),
76 pInputPosition(0),
77 pOutputPosition(0),
78 pCurParseNode(0),
79 eCurResult(OK),
80 cCurHotkey(0),
81 cCurHotkeySign(u_char('~'))
82 {
83 // Initialize parsing tree:
84 pUnknownAlpha = new WTT_Node(BR_ALPHABASE,0,0); // This will be deleted as part of the parsing tree.
85 for ( UINT8 i = BR_ALPHABASE; i < C_NR_OF_BRANCHES; i++)
86 {
87 pUnknownAlpha->SetBranch(i,pUnknownAlpha);
88 } // end for
89
90 dpParsingTreeTop = new WTT_Node(BR_START,0,pUnknownAlpha);
91
92 WTT_Node * dpNonAlpha = new WTT_Node(BR_NONALPHA,0,0);
93
94 dpNonAlpha->SetBranch(BR_NONALPHA,dpNonAlpha);
95 dpParsingTreeTop->SetBranch(BR_NONALPHA,dpNonAlpha);
96
97 WTT_Node * dpBackslash = new WTT_Node(BR_BACKSLASH,dpNonAlpha,dpNonAlpha);
98 dpBackslash->SetBranch(BR_END,0);
99
100 dpParsingTreeTop->SetBranch(BR_BACKSLASH,dpBackslash);
101 dpNonAlpha->SetBranch(BR_BACKSLASH,dpBackslash);
102
103
104 // Initialize character set:
105 SetCharSet(i_nWorkingCharSet);
106
107 if (C_BR_ALPHABASE != BR_ALPHABASE || C_NR_OF_BRANCHES != BR_MAX)
108 {
109 fprintf(stderr, "Assertion failed: file %s line %d.", __FILE__, __LINE__);
110 exit(1);
111 }
112 }
113
114 void
SetCharSet(CharSet i_nWorkingCharSet)115 WordTransTree::SetCharSet(CharSet i_nWorkingCharSet)
116 {
117 ByteString sConvert("\xC4\xD6\xDC\xE4\xF6\xFC\xDF");
118 const u_char * pConvert = (const u_char * ) ( sConvert.Convert(RTL_TEXTENCODING_MS_1252, i_nWorkingCharSet).GetBuffer() );
119
120 INT16 i = 0;
121 for ( ; i < C_NR_OF_POSSIBLE_CHARS; ++i )
122 {
123 cChar2Branch[i] = BR_NONALPHA;
124 } // end for
125 for ( i = 'a'; i <= 'z'; ++i )
126 {
127 cChar2Branch[i] = BR_ALPHABASE + i - 'a';
128 } // end for
129 for ( i = 'A'; i <= 'Z'; ++i )
130 {
131 cChar2Branch[i] = BR_ALPHABASE + i - 'A';
132 } // end for
133 cChar2Branch[pConvert[0]] = BR_AE;
134 cChar2Branch[pConvert[1]] = BR_OE;
135 cChar2Branch[pConvert[2]] = BR_UE;
136 cChar2Branch[pConvert[3]] = BR_AE;
137 cChar2Branch[pConvert[4]] = BR_OE;
138 cChar2Branch[pConvert[5]] = BR_UE;
139 cChar2Branch[pConvert[6]] = BR_SZ;
140
141 cChar2Branch[u_char('~')] = BR_HOTKEY;
142 cChar2Branch[u_char('&')] = BR_HOTKEY;
143
144
145 c_AE = pConvert[0];
146 c_OE = pConvert[1];
147 c_UE = pConvert[2];
148 c_ae = pConvert[3];
149 c_oe = pConvert[4];
150 c_ue = pConvert[5];
151 }
152
~WordTransTree()153 WordTransTree::~WordTransTree()
154 {
155 delete dpParsingTreeTop;
156 if (sOutput != 0)
157 delete [] sOutput;
158 }
159
160 void
AddWordPair(const ByteString & i_sOldString,const ByteString & i_sReplaceString)161 WordTransTree::AddWordPair( const ByteString & i_sOldString,
162 const ByteString & i_sReplaceString )
163 {
164 if (i_sOldString.Len() == 0)
165 return;
166
167 pCurParseNode = dpParsingTreeTop;
168 WTT_Node * pBranch = 0;
169 char cBranch = 0;
170
171 for ( constr pOld = i_sOldString.GetBuffer();
172 *pOld != 0;
173 pOld++ )
174 {
175 cBranch = CalculateBranch(*pOld);
176 pBranch = pCurParseNode->GetNextNode(cBranch);
177 if (pBranch == 0 || pBranch == pUnknownAlpha)
178 {
179 pBranch = new WTT_Node(cBranch,0,pUnknownAlpha);
180 pCurParseNode->SetBranch(cBranch,pBranch);
181 }
182 pCurParseNode = pBranch;
183 } // end for
184 pCurParseNode->SetAsTokenToReplace(i_sReplaceString);
185 }
186
187 void
InitTransformation(const char * i_sInput,UINT32 i_nInputLength,UINT32 i_nOutputMaxLength)188 WordTransTree::InitTransformation( const char * i_sInput,
189 UINT32 i_nInputLength,
190 UINT32 i_nOutputMaxLength )
191 {
192 sInput = (const u_char *)i_sInput;
193 nInputLength = i_nInputLength;
194 pInputEnd = &sInput[i_nInputLength];
195
196 pInputCurTokenStart = sInput;
197 pInputPosition = sInput;
198
199 if (nOutputMaxLength < i_nOutputMaxLength)
200 {
201 if (sOutput != 0)
202 delete [] sOutput;
203 sOutput = new unsigned char[i_nOutputMaxLength];
204 nOutputMaxLength = i_nOutputMaxLength;
205 }
206 pOutputPosition = sOutput;
207 }
208
209 /** pInputCurTokenStart and CurParseNode are updated just when
210 starting this function. After its end they must not be changed
211 till this functon is called again.
212 Outside this function pInputPositon and pOutputPosition are both
213 on the first not transformed char in their respective array.
214 **/
215 WordTransTree::E_Result
TransformNextToken()216 WordTransTree::TransformNextToken()
217 {
218 pInputCurTokenStart = pInputPosition;
219 pCurParseNode = dpParsingTreeTop;
220 cCurHotkey = 0;
221 eCurResult = OK;
222
223 WTT_Node * pBranch = 0;
224 UINT8 cBranch = 0;
225
226 for ( pCurParseNode = dpParsingTreeTop;
227 pInputPosition != pInputEnd;
228 ++pInputPosition )
229 {
230 cBranch = CalculateBranch(*pInputPosition);
231 pBranch = pCurParseNode->GetNextNode( cBranch );
232 if (pBranch != 0)
233 {
234 pCurParseNode = pBranch;
235 }
236 else
237 {
238 if (cBranch == BR_HOTKEY) // current letter is '~' or '&'.
239 {
240 // Logic of the following. There are 9 possible cases -
241 // A = alphabetic letter, NA = non alphabetic, TB = token begin,
242 // Eot = end of text:
243 // 1. A~A set hotkey to following letter, continue
244 // 2. A~NA token end
245 // 3. A~Eot token end
246 // 4. NA~A token end
247 // 5. NA~NA continue
248 // 6. A~Eof continue
249 // 7. TB~A set hotkey to following letter, continue
250 // 8. TB~NA continue
251 // 9. TB~Eot continue
252
253 // bNext and Prev are true, if there are alphabetic letters:
254 sal_Bool bNext = pInputPosition + 1 != pInputEnd
255 ? CalculateBranch(pInputPosition[1]) >= BR_ALPHABASE
256 : sal_False;
257 sal_Bool bPrev = pCurParseNode->Value() >= BR_ALPHABASE;
258
259 if ( bNext && (bPrev || pCurParseNode == dpParsingTreeTop) )
260 { // case 1. and 7.
261 Handle_Hotkey();
262 continue;
263 }
264 else if (!bPrev && !bNext)
265 { // case 5.,6.,8.,9.
266 continue;
267 }
268
269 // Case 2.,3.,4. :
270 // so this should be handled as an end of a token.
271 }
272 if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
273 {
274 Handle_TokenToKeep();
275 return eCurResult;
276 }
277 else
278 {
279 Handle_TokenToTransform();
280 return eCurResult;
281 } // endif (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
282 } // endif (pBranch == 0) else
283 } // end for
284
285 // If here, the text end is reached
286 if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
287 {
288 Handle_TokenToKeep();
289 return eCurResult;
290 }
291 else
292 {
293 Handle_TokenToTransform();
294 return eCurResult;
295 }
296 }
297
298 ByteString
CurReplacingString() const299 WordTransTree::CurReplacingString() const
300 {
301 return pCurParseNode->ReplaceString();
302 }
303
304 void
Handle_Hotkey()305 WordTransTree::Handle_Hotkey()
306 {
307 if (cCurHotkey == 0) // Avoid to replace the first found hotkey by
308 // a later one - though this shouldn't happen anyway.
309 {
310 cCurHotkey = (pInputPosition+1) != pInputEnd ? pInputPosition[1] : 0;
311 cCurHotkeySign = *pInputPosition;
312 }
313 }
314
315 void
Handle_TokenToKeep()316 WordTransTree::Handle_TokenToKeep()
317 {
318 UINT32 nTokenLength = pInputPosition-pInputCurTokenStart;
319
320 memcpy(pOutputPosition,pInputCurTokenStart,nTokenLength);
321
322 pOutputPosition += nTokenLength;
323 *pOutputPosition = '\0';
324 }
325
326 void
Handle_TokenToTransform()327 WordTransTree::Handle_TokenToTransform()
328 {
329 sal_Bool bHaveHotkey = CalculateBranch(cCurHotkey) >= BR_ALPHABASE;
330 const ByteString & rReplace = pCurParseNode->ReplaceString();
331
332 // Find position of hotkey in replace-string:
333 sal_uInt16 nHotkeyPos = bHaveHotkey
334 ? rReplace.Search(char(cCurHotkey))
335 : STRING_NOTFOUND;
336 if (nHotkeyPos == STRING_NOTFOUND && bHaveHotkey)
337 {
338 if (cCurHotkey < 128)
339 {
340 if (islower(cCurHotkey))
341 nHotkeyPos = rReplace.Search(toupper(char(cCurHotkey)));
342 else
343 nHotkeyPos = rReplace.Search(tolower(char(cCurHotkey)));
344 }
345 else // cCurHotkey >= 128
346 {
347 if (cCurHotkey == c_ae)
348 nHotkeyPos = rReplace.Search(char(c_AE));
349 else if (cCurHotkey == c_oe)
350 nHotkeyPos = rReplace.Search(char(c_OE));
351 else if (cCurHotkey == c_ue)
352 nHotkeyPos = rReplace.Search(char(c_UE));
353 else if (cCurHotkey == c_AE)
354 nHotkeyPos = rReplace.Search(char(c_ae));
355 else if (cCurHotkey == c_OE)
356 nHotkeyPos = rReplace.Search(char(c_oe));
357 else if (cCurHotkey == c_UE)
358 nHotkeyPos = rReplace.Search(char(c_ue));
359 } // endif (cCurHotkey < 128) else
360
361 if (nHotkeyPos == STRING_NOTFOUND)
362 {
363 eCurResult = HOTKEY_LOST;
364 bHaveHotkey = sal_False;
365 }
366 } // endif (nHotkeyPos == STRING_NOT_FOUND && bHaveHotkey)
367
368
369 UINT32 nOutputTokenLength = rReplace.Len() + (bHaveHotkey ? 1 : 0);
370
371 if (bHaveHotkey)
372 {
373 memcpy( pOutputPosition,
374 pCurParseNode->ReplaceString().GetBuffer(),
375 nHotkeyPos );
376 *(pOutputPosition + nHotkeyPos) = cCurHotkeySign;
377 memcpy( pOutputPosition + nHotkeyPos + 1,
378 pCurParseNode->ReplaceString().GetBuffer() + nHotkeyPos,
379 nOutputTokenLength - nHotkeyPos - 1);
380 }
381 else
382 {
383 memcpy( pOutputPosition,
384 pCurParseNode->ReplaceString().GetBuffer(),
385 nOutputTokenLength );
386 }
387
388 // Convert first letter into upper if necessary:
389 u_char cInStart = CalculateBranch(*pInputCurTokenStart) == BR_HOTKEY
390 ? pInputCurTokenStart[1]
391 : pInputCurTokenStart[0] ;
392 u_char * pOutStart = nHotkeyPos == 0
393 ? pOutputPosition + 1
394 : pOutputPosition ;
395 if (isupper(cInStart) || cInStart > 127)
396 { // Possibly cInStart is upper character:
397 if (isupper(cInStart) || cInStart == c_AE || cInStart == c_OE || cInStart == c_UE)
398 { // Surely cInStart is upper character:
399 u_char cOutStart = *pOutStart;
400 if (cOutStart < 128)
401 *pOutStart = toupper(cOutStart);
402 else if (cOutStart == c_ae)
403 *pOutStart = c_AE;
404 else if (cOutStart == c_oe)
405 *pOutStart = c_OE;
406 else if (cOutStart == c_ue)
407 *pOutStart = c_UE;
408 }
409 } // endif (isupper(cInStart) || cInStart > 127)
410
411 pOutputPosition += nOutputTokenLength;
412 *pOutputPosition = '\0';
413 }
414
415