xref: /aoo4110/main/sw/source/filter/ascii/parasc.cxx (revision b1cdbd2c)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_sw.hxx"
26 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */
27 
28 
29 #include <tools/stream.hxx>
30 #include <hintids.hxx>
31 #include <rtl/tencinfo.h>
32 #include <sfx2/printer.hxx>
33 #include <editeng/fontitem.hxx>
34 #include <editeng/langitem.hxx>
35 #include <editeng/brkitem.hxx>
36 #include <editeng/scripttypeitem.hxx>
37 #include <shellio.hxx>
38 #include <doc.hxx>
39 #include <swtypes.hxx>
40 #include <ndtxt.hxx>
41 #include <pam.hxx>
42 #include <frmatr.hxx>
43 #include <fltini.hxx>
44 #include <pagedesc.hxx>
45 #include <breakit.hxx>
46 #include <swerror.h>
47 #ifndef _STATSTR_HRC
48 #include <statstr.hrc>          // ResId fuer Statusleiste
49 #endif
50 #include <mdiexp.hxx>           // ...Percent()
51 #include <poolfmt.hxx>
52 
53 #include "vcl/metric.hxx"
54 
// Number of bytes requested from the input stream per read. The parser
// allocates its byte buffer as ASC_BUFFLEN + 2 so that a terminating
// (two byte) zero can still be written behind a completely filled buffer.
#define ASC_BUFFLEN 4096
56 
57 class SwASCIIParser
58 {
59 	SwDoc* pDoc;
60 	SwPaM* pPam;
61 	SvStream& rInput;
62 	sal_Char* pArr;
63 	const SwAsciiOptions& rOpt;
64 	SfxItemSet* pItemSet;
65 	long nFileSize;
66 	sal_uInt16 nScript;
67 	bool bNewDoc;
68 
69 	sal_uLong ReadChars();
70 	void InsertText( const String& rStr );
71 
72 public:
73 	SwASCIIParser( SwDoc* pD, const SwPaM& rCrsr, SvStream& rIn,
74 							int bReadNewDoc, const SwAsciiOptions& rOpts );
75 	~SwASCIIParser();
76 
77 	sal_uLong CallParser();
78 };
79 
80 
81 // Aufruf fuer die allg. Reader-Schnittstelle
Read(SwDoc & rDoc,const String &,SwPaM & rPam,const String &)82 sal_uLong AsciiReader::Read( SwDoc &rDoc, const String&, SwPaM &rPam, const String & )
83 {
84 	if( !pStrm )
85 	{
86 		ASSERT( !this, "ASCII-Read ohne Stream" );
87 		return ERR_SWG_READ_ERROR;
88 	}
89 
90 	//JP 18.01.96: Alle Ueberschriften sind normalerweise ohne
91 	//              Kapitelnummer. Darum hier explizit abschalten
92 	//              weil das Default jetzt wieder auf AN ist.
93 	if( !bInsertMode )
94 		Reader::SetNoOutlineNum( rDoc );
95 
96 	SwASCIIParser* pParser = new SwASCIIParser( &rDoc, rPam, *pStrm,
97 										!bInsertMode, aOpt.GetASCIIOpts() );
98 	sal_uLong nRet = pParser->CallParser();
99 
100 	delete pParser;
101 	// after Read reset the options
102 	aOpt.ResetASCIIOpts();
103 	return nRet;
104 }
105 
SwASCIIParser(SwDoc * pD,const SwPaM & rCrsr,SvStream & rIn,int bReadNewDoc,const SwAsciiOptions & rOpts)106 SwASCIIParser::SwASCIIParser(SwDoc* pD, const SwPaM& rCrsr, SvStream& rIn,
107     int bReadNewDoc, const SwAsciiOptions& rOpts)
108 	: pDoc(pD), rInput(rIn), rOpt(rOpts), nScript(0), bNewDoc(bReadNewDoc)
109 {
110 	pPam = new SwPaM( *rCrsr.GetPoint() );
111 	pArr = new sal_Char [ ASC_BUFFLEN + 2 ];
112 
113 	pItemSet = new SfxItemSet( pDoc->GetAttrPool(),
114 				RES_CHRATR_FONT,		RES_CHRATR_LANGUAGE,
115 				RES_CHRATR_CJK_FONT,	RES_CHRATR_CJK_LANGUAGE,
116 				RES_CHRATR_CTL_FONT,	RES_CHRATR_CTL_LANGUAGE,
117 				0 );
118 
119 	// set defaults from the options
120 	if( rOpt.GetLanguage() )
121 	{
122 		SvxLanguageItem aLang( (LanguageType)rOpt.GetLanguage(),
123 						 		RES_CHRATR_LANGUAGE );
124 		pItemSet->Put( aLang );
125 		pItemSet->Put( aLang, RES_CHRATR_CJK_LANGUAGE );
126 		pItemSet->Put( aLang, RES_CHRATR_CTL_LANGUAGE );
127 	}
128 	if( rOpt.GetFontName().Len() )
129 	{
130 		Font aTextFont( rOpt.GetFontName(), Size( 0, 10 ) );
131         if( pDoc->getPrinter( false ) )
132             aTextFont = pDoc->getPrinter( false )->GetFontMetric( aTextFont );
133 		SvxFontItem aFont( aTextFont.GetFamily(), aTextFont.GetName(),
134                            aEmptyStr, aTextFont.GetPitch(), aTextFont.GetCharSet(), RES_CHRATR_FONT );
135 		pItemSet->Put( aFont );
136 		pItemSet->Put( aFont, RES_CHRATR_CJK_FONT );
137 		pItemSet->Put( aFont, RES_CHRATR_CTL_FONT );
138 	}
139 }
140 
~SwASCIIParser()141 SwASCIIParser::~SwASCIIParser()
142 {
143 	delete pPam;
144 	delete [] pArr;
145 	delete pItemSet;
146 }
147 
148 
149 // Aufruf des Parsers
CallParser()150 sal_uLong SwASCIIParser::CallParser()
151 {
152 	rInput.Seek(STREAM_SEEK_TO_END);
153 	rInput.ResetError();
154 
155 	nFileSize = rInput.Tell();
156 	rInput.Seek(STREAM_SEEK_TO_BEGIN);
157 	rInput.ResetError();
158 
159 	::StartProgress( STR_STATSTR_W4WREAD, 0, nFileSize, pDoc->GetDocShell() );
160 
161 	SwPaM* pInsPam = 0;
162 	xub_StrLen nSttCntnt = 0;
163 	if (!bNewDoc)
164 	{
165 		const SwNodeIndex& rTmp = pPam->GetPoint()->nNode;
166 		pInsPam = new SwPaM( rTmp, rTmp, 0, -1 );
167 		nSttCntnt = pPam->GetPoint()->nContent.GetIndex();
168 	}
169 
170     SwTxtFmtColl *pColl = 0;
171 
172     if (bNewDoc)
173     {
174         pColl = pDoc->GetTxtCollFromPool(RES_POOLCOLL_HTML_PRE, false);
175         if (!pColl)
176             pColl = pDoc->GetTxtCollFromPool(RES_POOLCOLL_STANDARD,false);
177         if (pColl)
178             pDoc->SetTxtFmtColl(*pPam, pColl);
179     }
180 
181 	sal_uLong nError = ReadChars();
182 
183 	if( pItemSet )
184 	{
185 		// set only the attribute, for scanned scripts.
186 		if( !( SCRIPTTYPE_LATIN & nScript ))
187 		{
188 			pItemSet->ClearItem( RES_CHRATR_FONT );
189 			pItemSet->ClearItem( RES_CHRATR_LANGUAGE );
190 		}
191 		if( !( SCRIPTTYPE_ASIAN & nScript ))
192 		{
193 			pItemSet->ClearItem( RES_CHRATR_CJK_FONT );
194 			pItemSet->ClearItem( RES_CHRATR_CJK_LANGUAGE );
195 		}
196 		if( !( SCRIPTTYPE_COMPLEX & nScript ))
197 		{
198 			pItemSet->ClearItem( RES_CHRATR_CTL_FONT );
199 			pItemSet->ClearItem( RES_CHRATR_CTL_LANGUAGE );
200 		}
201 		if( pItemSet->Count() )
202 		{
203 			if( bNewDoc )
204 			{
205                 if (pColl)
206                 {
207 				    // Using the pool defaults for the font causes significant
208 				    // trouble for the HTML filter, because it is not able
209 				    // to export the pool defaults (or to be more precice:
210 				    // the HTML filter is not able to detect whether a pool
211 				    // default has changed or not. Even a comparison with the
212 				    // HTMLi template does not work, because the defaults are
213 				    // not copied when a new doc is created. The result of
214 				    // comparing pool defaults therfor would be that the
215 				    // defaults are exported always if the have changed for
216 				    // text documents in general. That's not sensible, as well
217 				    // as it is not sensible to export them always.
218 				    sal_uInt16 aWhichIds[4] =
219                     {
220                         RES_CHRATR_FONT, RES_CHRATR_CJK_FONT,
221                         RES_CHRATR_CTL_FONT, 0
222                     };
223 				    sal_uInt16 *pWhichIds = aWhichIds;
224 				    while (*pWhichIds)
225 				    {
226 				        const SfxPoolItem *pItem;
227 					    if (SFX_ITEM_SET == pItemSet->GetItemState(*pWhichIds,
228                             false, &pItem))
229 					    {
230                             pColl->SetFmtAttr( *pItem );
231 						    pItemSet->ClearItem( *pWhichIds );
232 					    }
233 					    ++pWhichIds;
234 				    }
235                 }
236 				if (pItemSet->Count())
237 					pDoc->SetDefault(*pItemSet);
238 			}
239 			else if( pInsPam )
240 			{
241 				// then set over the insert range the defined attributes
242 				*pInsPam->GetMark() = *pPam->GetPoint();
243 				pInsPam->GetPoint()->nNode++;
244 				pInsPam->GetPoint()->nContent.Assign(
245 									pInsPam->GetCntntNode(), nSttCntnt );
246 
247 				// !!!!!
248 				ASSERT( !this, "Have to change - hard attr. to para. style" );
249                 pDoc->InsertItemSet( *pInsPam, *pItemSet, 0 );
250 			}
251 		}
252 		delete pItemSet, pItemSet = 0;
253 	}
254 
255 	if( pInsPam )
256 		delete pInsPam;
257 
258 	::EndProgress( pDoc->GetDocShell() );
259 	return nError;
260 }
261 
// Reads the whole input stream block by block (at most ASC_BUFFLEN bytes per
// read), converts each block to Unicode, scans it character by character and
// inserts the text into the document, splitting paragraphs at line ends.
//
// Pointer roles inside the loop:
//   pStt     - character currently examined
//   pEnd     - one behind the last character of the current block
//   pLastStt - start of the text that has not been inserted yet
//
// Returns ERROR_SW_READ_BASE if no text converter could be created,
// otherwise 0.
// NOTE(review): a stream error merely ends the loop; 0 is still returned in
// that case - confirm this is intended.
sal_uLong SwASCIIParser::ReadChars()
{
	sal_Unicode *pStt = 0, *pEnd = 0, *pLastStt = 0;
	long nReadCnt = 0, nLineLen = 0;
	// set to 0x0d when (in CRLF mode) a CR was the last character of a block
	sal_Unicode cLastCR = 0;
    bool bSwapUnicode = false;

    // If the caller passed options that are identical to default constructed
    // ones, try to detect the character set from the first block of the file
    // and continue with the local option object aEmpty instead of rOpt.
    const SwAsciiOptions *pUseMe=&rOpt;
    SwAsciiOptions aEmpty;
    if (nFileSize >= 2 &&
        aEmpty.GetFontName() == rOpt.GetFontName() &&
        aEmpty.GetCharSet() == rOpt.GetCharSet() &&
        aEmpty.GetLanguage() == rOpt.GetLanguage() &&
        aEmpty.GetParaFlags() == rOpt.GetParaFlags())
    {
        sal_uLong nLen, nOrig;
        nOrig = nLen = rInput.Read(pArr, ASC_BUFFLEN);
        CharSet eCharSet;
        // NOTE(review): nLen is handed to the detection while nOrig keeps the
        // number of bytes read - presumably the detection may shrink nLen
        // (e.g. to skip a signature at the start); confirm against
        // SwIoSystem::IsDetectableText.
        bool bRet = SwIoSystem::IsDetectableText(pArr, nLen, &eCharSet, &bSwapUnicode);
        ASSERT(bRet, "Autodetect of text import without nag dialog must "
            "have failed");
        // Rewind the stream so the main loop reads the data again: by nLen
        // on successful detection, otherwise by everything that was read.
        if (bRet && eCharSet != RTL_TEXTENCODING_DONTKNOW)
        {
            aEmpty.SetCharSet(eCharSet);
            rInput.SeekRel(-(long(nLen)));
        }
        else
            rInput.SeekRel(-(long(nOrig)));
        pUseMe=&aEmpty;
    }

	rtl_TextToUnicodeConverter hConverter=0;
	rtl_TextToUnicodeContext hContext=0;
	CharSet currentCharSet = pUseMe->GetCharSet();
	if (RTL_TEXTENCODING_UCS2 != currentCharSet)
	{
		// byte oriented encoding: convert via the rtl text converter,
		// an unknown character set is treated as US-ASCII
		if( currentCharSet == RTL_TEXTENCODING_DONTKNOW )
	            currentCharSet = RTL_TEXTENCODING_ASCII_US;
		hConverter = rtl_createTextToUnicodeConverter( currentCharSet );
		ASSERT( hConverter, "no string convert avaiable" );
		if (!hConverter)
            return ERROR_SW_READ_BASE;
		bSwapUnicode = false;
		hContext = rtl_createTextToUnicodeContext( hConverter );
	}
	else if (pUseMe != &aEmpty)  //Already successfully figured out type
	{
		// UCS2 requested through the caller's options: let the stream
		// prepare for Unicode text and tell whether bytes must be swapped
		rInput.StartReadingUnicodeText( currentCharSet );
		bSwapUnicode = rInput.IsEndianSwap();
	}

	// receives the converted text of one block (converter case only)
	String sWork;
	// number of not yet converted bytes kept at the start of pArr
	sal_uLong nArrOffset = 0;

	do {
		if( pStt >= pEnd )
		{
			// current block is used up: flush the pending text first
			if( pLastStt != pStt )
				InsertText( String( pLastStt ));

			// read in a new block
			sal_uLong lGCount;
			if( SVSTREAM_OK != rInput.GetError() || 0 == (lGCount =
						rInput.Read( pArr + nArrOffset,
									 ASC_BUFFLEN - nArrOffset )))
				break;		// leave the loop

            /*
            #98380#
            If there was some unconverted bytes on the last cycle then they
            were put at the beginning of the array, so total bytes available
            to convert this cycle includes them. If we found 0 following bytes
            then we ignore the previous partial character.
            */
            lGCount+=nArrOffset;

			if( hConverter )
			{
				sal_uInt32 nInfo;
				sal_Size nNewLen = lGCount, nCntBytes;
                sal_Unicode* pBuf = sWork.AllocBuffer( static_cast< xub_StrLen >(nNewLen) );

				nNewLen = rtl_convertTextToUnicode( hConverter, hContext,
								pArr, lGCount, pBuf, nNewLen,
								(
								RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
								RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
								RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT |
                                RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
								),
								&nInfo,
								&nCntBytes );
				// bytes the converter did not consume are moved to the
				// front of pArr; the next read appends behind them
				if( 0 != ( nArrOffset = lGCount - nCntBytes ) )
					memmove( pArr, pArr + nCntBytes, nArrOffset );
                sWork.ReleaseBufferAccess( static_cast< xub_StrLen >(nNewLen) );

				pStt = pLastStt = sWork.GetBufferAccess();
				pEnd = pStt + nNewLen;
			}
			else
			{
				// no converter: the bytes in pArr are used directly as
				// Unicode characters
				pStt = pLastStt = (sal_Unicode*)pArr;
				pEnd = (sal_Unicode*)(pArr + lGCount);

				if( bSwapUnicode )
				{
					// exchange the two bytes of every character
					sal_Char* pF = pArr, *pN = pArr + 1;
					for( sal_uLong n = 0; n < lGCount; n += 2, pF += 2, pN += 2 )
					{
						sal_Char c = *pF;
						*pF = *pN;
						*pN = c;
					}
				}
			}

			// Zero terminate the block so that String( pLastStt ) stops at
			// its end. For the direct case this writes behind the data in
			// pArr, which is why the buffer has ASC_BUFFLEN + 2 bytes.
			*pEnd = 0;
			nReadCnt += lGCount;

			::SetProgressState( nReadCnt, pDoc->GetDocShell() );

			if( cLastCR )
			{
				// the previous block ended with a CR: swallow a LF that
				// starts this block and finish the paragraph now
				if( 0x0a == *pStt && 0x0d == cLastCR )
					pLastStt = ++pStt;
				cLastCR = 0;
				nLineLen = 0;
				// JP 03.04.96: the very last line end at the end of the
				// file does not produce a new paragraph
				if( !rInput.IsEof() || !(pEnd == pStt ||
					( !*pEnd && pEnd == pStt+1 ) ) )
					pDoc->SplitNode( *pPam->GetPoint(), false );
			}
		}

		// bIns:       the current character belongs to the text, just step on
		// bSplitNode: a paragraph end was found, flush the text and split
		bool bIns = true, bSplitNode = false;
		switch( *pStt )
		{
//JP 12.11.2001: task 94636 - don't ignore all behind the zero character,
//							  change it to the default "control character"
//		case 0:
//					pEnd = pStt;
//					bIns = false ;
//					break;

		// LF ends a paragraph only if the options say lines end with LF
		case 0x0a:	if( LINEEND_LF == pUseMe->GetParaFlags() )
					{
						bIns = false;
						*pStt = 0;
						++pStt;

						// JP 03.04.96: the very last line end at the end of
						// the file does not produce a new paragraph
						if( !rInput.IsEof() || pEnd != pStt )
							bSplitNode = true;
					}
					break;

		// CR ends a paragraph for every line end setting except LF
		case 0x0d:	if( LINEEND_LF != pUseMe->GetParaFlags() )
					{
						bIns = false;
						*pStt = 0;
						++pStt;

						bool bChkSplit = false;
						if( LINEEND_CRLF == pUseMe->GetParaFlags() )
						{
							// CR at the end of the block: the LF can only be
							// checked after the next read, remember the CR
							if( pStt == pEnd )
								cLastCR = 0x0d;
							else if( 0x0a == *pStt )
							{
								++pStt;
								bChkSplit = true;
							}
							// NOTE(review): for a CR that is not followed by
							// a LF nothing is flushed and pLastStt stays
							// unchanged, although the CR was overwritten
							// with 0. String( pLastStt ) at the next flush
							// therefore seems to stop at that 0 and the text
							// behind it looks lost - confirm.
						}
						else
							bChkSplit = true;

							// JP 03.04.96: the very last line end at the end
							// of the file does not produce a new paragraph
						if( bChkSplit && ( !rInput.IsEof() || pEnd != pStt ))
							bSplitNode = true;
					}
					break;

		case 0x0c:
					{
						// form feed: insert a hard page break
						*pStt++ = 0;
						if( nLineLen )
						{
							// Change to charset system!!!!
							//rOpt.GetCharSet();
							InsertText( String( pLastStt ));
						}
						pDoc->SplitNode( *pPam->GetPoint(), false );
                        pDoc->InsertPoolItem(
                            *pPam, SvxFmtBreakItem( SVX_BREAK_PAGE_BEFORE, RES_BREAK ), 0);
						pLastStt = pStt;
						nLineLen = 0;
						bIns = false;
					}
					break;

		// 0x1a as the very last character of the file is dropped (replaced
		// by the terminating zero), anywhere else it is shown as '#'
		case 0x1a:
					if( nReadCnt == nFileSize && pStt+1 == pEnd )
						*pStt = 0;
					else
						*pStt = '#';        // substitute representation
					break;

		// tabs are kept as they are
		case '\t':	break;

		default:
			if( ' ' > *pStt )
					// control character found, replace it by '#'
				*pStt = '#';
			break;
		}

		if( bIns )
		{
			// Limit the paragraph length: within the last 100 characters
			// before MAX_ASCII_PARA break at a blank, at MAX_ASCII_PARA - 1
			// break unconditionally. The current character is replaced by 0
			// only while the text in front of it is inserted.
			if( ( nLineLen >= MAX_ASCII_PARA - 100 ) &&
				( ( *pStt == ' ' ) || ( nLineLen >= MAX_ASCII_PARA - 1 ) ) )
			{
				sal_Unicode c = *pStt;
				*pStt = 0;
				InsertText( String( pLastStt ));
				pDoc->SplitNode( *pPam->GetPoint(), false );
				pLastStt = pStt;
				nLineLen = 0;
				*pStt = c;
			}
			++pStt;
			++nLineLen;
		}
		else if( bSplitNode )
		{
			// a line end was recognized, so store the text collected so far
			// and start a new paragraph

			InsertText( String( pLastStt ));
			pDoc->SplitNode( *pPam->GetPoint(), false );
			pLastStt = pStt;
			nLineLen = 0;
		}
	} while(true);

	// release the converter, if one was created
	if( hConverter )
	{
		rtl_destroyTextToUnicodeContext( hConverter, hContext );
		rtl_destroyTextToUnicodeConverter( hConverter );
	}
	return 0;
}
513 
InsertText(const String & rStr)514 void SwASCIIParser::InsertText( const String& rStr )
515 {
516     pDoc->InsertString( *pPam, rStr );
517 	if( pItemSet && pBreakIt && nScript != ( SCRIPTTYPE_LATIN |
518 											 SCRIPTTYPE_ASIAN |
519 											 SCRIPTTYPE_COMPLEX ) )
520 		nScript |= pBreakIt->GetAllScriptsOfText( rStr );
521 }
522 
523 /* vi:set tabstop=4 shiftwidth=4 expandtab: */
524