xref: /trunk/main/sw/source/filter/ascii/parasc.cxx (revision ce48dd1f)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_sw.hxx"
26 
27 #include <tools/stream.hxx>
28 #include <hintids.hxx>
29 #include <rtl/tencinfo.h>
30 #include <sfx2/printer.hxx>
31 #include <editeng/fontitem.hxx>
32 #include <editeng/langitem.hxx>
33 #include <editeng/brkitem.hxx>
34 #include <editeng/scripttypeitem.hxx>
35 #include <shellio.hxx>
36 #include <doc.hxx>
37 #include <swtypes.hxx>
38 #include <ndtxt.hxx>
39 #include <pam.hxx>
40 #include <frmatr.hxx>
41 #include <fltini.hxx>
42 #include <pagedesc.hxx>
43 #include <breakit.hxx>
44 #include <swerror.h>
45 #ifndef _STATSTR_HRC
46 #include <statstr.hrc> // ResId fuer Statusleiste
47 #endif
48 #include <mdiexp.hxx> // ...Percent()
49 #include <poolfmt.hxx>
50 
51 #include "vcl/metric.hxx"
52 
53 #define ASC_BUFFLEN 4096
54 
55 class SwASCIIParser
56 {
57 	SwDoc* pDoc;
58 	SwPaM* pPam;
59 	SvStream& rInput;
60 	sal_Char* pArr;
61 	const SwAsciiOptions& rOpt;
62 	SfxItemSet* pItemSet;
63 	long nFileSize;
64 	sal_uInt16 nScript;
65 	bool bNewDoc;
66 
67 	sal_uLong ReadChars();
68 	void InsertText( const String& rStr );
69 
70 public:
71 	SwASCIIParser( SwDoc* pD, const SwPaM& rCrsr, SvStream& rIn,
72 							int bReadNewDoc, const SwAsciiOptions& rOpts );
73 	~SwASCIIParser();
74 
75 	sal_uLong CallParser();
76 };
77 
78 
79 // Aufruf fuer die allg. Reader-Schnittstelle
Read(SwDoc & rDoc,const String &,SwPaM & rPam,const String &)80 sal_uLong AsciiReader::Read( SwDoc &rDoc, const String&, SwPaM &rPam, const String & )
81 {
82 	if( !pStrm )
83 	{
84 		ASSERT( sal_False, "ASCII-Read without stream" );
85 		return ERR_SWG_READ_ERROR;
86 	}
87 
88 	//JP 18.01.96: Alle Ueberschriften sind normalerweise ohne
89 	//              Kapitelnummer. Darum hier explizit abschalten
90 	//              weil das Default jetzt wieder auf AN ist.
91 	if( !bInsertMode )
92 		Reader::SetNoOutlineNum( rDoc );
93 
94 	SwASCIIParser* pParser = new SwASCIIParser( &rDoc, rPam, *pStrm,
95 										!bInsertMode, aOpt.GetASCIIOpts() );
96 	sal_uLong nRet = pParser->CallParser();
97 
98 	delete pParser;
99 	// after Read reset the options
100 	aOpt.ResetASCIIOpts();
101 	return nRet;
102 }
103 
SwASCIIParser(SwDoc * pD,const SwPaM & rCrsr,SvStream & rIn,int bReadNewDoc,const SwAsciiOptions & rOpts)104 SwASCIIParser::SwASCIIParser(SwDoc* pD, const SwPaM& rCrsr, SvStream& rIn,
105 	int bReadNewDoc, const SwAsciiOptions& rOpts)
106 	: pDoc(pD), rInput(rIn), rOpt(rOpts), nScript(0), bNewDoc(bReadNewDoc)
107 {
108 	pPam = new SwPaM( *rCrsr.GetPoint() );
109 	pArr = new sal_Char [ ASC_BUFFLEN + 2 ];
110 
111 	pItemSet = new SfxItemSet( pDoc->GetAttrPool(),
112 				RES_CHRATR_FONT,		RES_CHRATR_LANGUAGE,
113 				RES_CHRATR_CJK_FONT,	RES_CHRATR_CJK_LANGUAGE,
114 				RES_CHRATR_CTL_FONT,	RES_CHRATR_CTL_LANGUAGE,
115 				0 );
116 
117 	// set defaults from the options
118 	if( rOpt.GetLanguage() )
119 	{
120 		SvxLanguageItem aLang( (LanguageType)rOpt.GetLanguage(),
121 						 		RES_CHRATR_LANGUAGE );
122 		pItemSet->Put( aLang );
123 		pItemSet->Put( aLang, RES_CHRATR_CJK_LANGUAGE );
124 		pItemSet->Put( aLang, RES_CHRATR_CTL_LANGUAGE );
125 	}
126 	if( rOpt.GetFontName().Len() )
127 	{
128 		Font aTextFont( rOpt.GetFontName(), Size( 0, 10 ) );
129 		if( pDoc->getPrinter( false ) )
130 			aTextFont = pDoc->getPrinter( false )->GetFontMetric( aTextFont );
131 		SvxFontItem aFont( aTextFont.GetFamily(), aTextFont.GetName(),
132 						   aEmptyStr, aTextFont.GetPitch(), aTextFont.GetCharSet(), RES_CHRATR_FONT );
133 		pItemSet->Put( aFont );
134 		pItemSet->Put( aFont, RES_CHRATR_CJK_FONT );
135 		pItemSet->Put( aFont, RES_CHRATR_CTL_FONT );
136 	}
137 }
138 
~SwASCIIParser()139 SwASCIIParser::~SwASCIIParser()
140 {
141 	delete pPam;
142 	delete [] pArr;
143 	delete pItemSet;
144 }
145 
146 
147 // Aufruf des Parsers
CallParser()148 sal_uLong SwASCIIParser::CallParser()
149 {
150 	rInput.Seek(STREAM_SEEK_TO_END);
151 	rInput.ResetError();
152 
153 	nFileSize = rInput.Tell();
154 	rInput.Seek(STREAM_SEEK_TO_BEGIN);
155 	rInput.ResetError();
156 
157 	::StartProgress( STR_STATSTR_W4WREAD, 0, nFileSize, pDoc->GetDocShell() );
158 
159 	SwPaM* pInsPam = 0;
160 	xub_StrLen nSttCntnt = 0;
161 	if (!bNewDoc)
162 	{
163 		const SwNodeIndex& rTmp = pPam->GetPoint()->nNode;
164 		pInsPam = new SwPaM( rTmp, rTmp, 0, -1 );
165 		nSttCntnt = pPam->GetPoint()->nContent.GetIndex();
166 	}
167 
168 	SwTxtFmtColl *pColl = 0;
169 
170 	if (bNewDoc)
171 	{
172 		pColl = pDoc->GetTxtCollFromPool(RES_POOLCOLL_HTML_PRE, false);
173 		if (!pColl)
174 			pColl = pDoc->GetTxtCollFromPool(RES_POOLCOLL_STANDARD,false);
175 		if (pColl)
176 			pDoc->SetTxtFmtColl(*pPam, pColl);
177 	}
178 
179 	sal_uLong nError = ReadChars();
180 
181 	if( pItemSet )
182 	{
183 		// set only the attribute, for scanned scripts.
184 		if( !( SCRIPTTYPE_LATIN & nScript ))
185 		{
186 			pItemSet->ClearItem( RES_CHRATR_FONT );
187 			pItemSet->ClearItem( RES_CHRATR_LANGUAGE );
188 		}
189 		if( !( SCRIPTTYPE_ASIAN & nScript ))
190 		{
191 			pItemSet->ClearItem( RES_CHRATR_CJK_FONT );
192 			pItemSet->ClearItem( RES_CHRATR_CJK_LANGUAGE );
193 		}
194 		if( !( SCRIPTTYPE_COMPLEX & nScript ))
195 		{
196 			pItemSet->ClearItem( RES_CHRATR_CTL_FONT );
197 			pItemSet->ClearItem( RES_CHRATR_CTL_LANGUAGE );
198 		}
199 		if( pItemSet->Count() )
200 		{
201 			if( bNewDoc )
202 			{
203 				if (pColl)
204 				{
205 					// Using the pool defaults for the font causes significant
206 					// trouble for the HTML filter, because it is not able
207 					// to export the pool defaults (or to be more precise:
208 					// the HTML filter is not able to detect whether a pool
209 					// default has changed or not. Even a comparison with the
210 					// HTMLi template does not work, because the defaults are
211 					// not copied when a new doc is created. The result of
212 					// comparing pool defaults therefor would be that the
213 					// defaults are exported always if the have changed for
214 					// text documents in general. That's not sensible, as well
215 					// as it is not sensible to export them always.
216 					sal_uInt16 aWhichIds[4] =
217 					{
218 						RES_CHRATR_FONT, RES_CHRATR_CJK_FONT,
219 						RES_CHRATR_CTL_FONT, 0
220 					};
221 					sal_uInt16 *pWhichIds = aWhichIds;
222 					while (*pWhichIds)
223 					{
224 						const SfxPoolItem *pItem;
225 						if (SFX_ITEM_SET == pItemSet->GetItemState(*pWhichIds,
226 							false, &pItem))
227 						{
228 							pColl->SetFmtAttr( *pItem );
229 							pItemSet->ClearItem( *pWhichIds );
230 						}
231 						++pWhichIds;
232 					}
233 				}
234 				if (pItemSet->Count())
235 					pDoc->SetDefault(*pItemSet);
236 			}
237 			else if( pInsPam )
238 			{
239 				// then set over the insert range the defined attributes
240 				*pInsPam->GetMark() = *pPam->GetPoint();
241 				pInsPam->GetPoint()->nNode++;
242 				pInsPam->GetPoint()->nContent.Assign(
243 									pInsPam->GetCntntNode(), nSttCntnt );
244 
245 				// !!!!!
246 				ASSERT( sal_False, "Have to change - hard attr. to para. style" );
247 				pDoc->InsertItemSet( *pInsPam, *pItemSet, 0 );
248 			}
249 		}
250 		delete pItemSet, pItemSet = 0;
251 	}
252 
253 	if( pInsPam )
254 		delete pInsPam;
255 
256 	::EndProgress( pDoc->GetDocShell() );
257 	return nError;
258 }
259 
ReadChars()260 sal_uLong SwASCIIParser::ReadChars()
261 {
262 	sal_Unicode *pStt = 0, *pEnd = 0, *pLastStt = 0;
263 	long nReadCnt = 0, nLineLen = 0;
264 	sal_Unicode cLastCR = 0;
265 	bool bSwapUnicode = false;
266 
267 	const SwAsciiOptions *pUseMe=&rOpt;
268 	SwAsciiOptions aEmpty;
269 	if (nFileSize >= 2 &&
270 		aEmpty.GetFontName() == rOpt.GetFontName() &&
271 		aEmpty.GetCharSet() == rOpt.GetCharSet() &&
272 		aEmpty.GetLanguage() == rOpt.GetLanguage() &&
273 		aEmpty.GetParaFlags() == rOpt.GetParaFlags())
274 	{
275 		sal_uLong nLen, nOrig;
276 		nOrig = nLen = rInput.Read(pArr, ASC_BUFFLEN);
277 		CharSet eCharSet;
278 		bool bRet = SwIoSystem::IsDetectableText(pArr, nLen, &eCharSet, &bSwapUnicode);
279 		ASSERT(bRet, "Autodetect of text import without nag dialog must "
280 			"have failed");
281 		if (bRet && eCharSet != RTL_TEXTENCODING_DONTKNOW)
282 		{
283 			aEmpty.SetCharSet(eCharSet);
284 			rInput.SeekRel(-(long(nLen)));
285 		}
286 		else
287 			rInput.SeekRel(-(long(nOrig)));
288 		pUseMe=&aEmpty;
289 	}
290 
291 	rtl_TextToUnicodeConverter hConverter=0;
292 	rtl_TextToUnicodeContext hContext=0;
293 	CharSet currentCharSet = pUseMe->GetCharSet();
294 	if (RTL_TEXTENCODING_UCS2 != currentCharSet)
295 	{
296 		if( currentCharSet == RTL_TEXTENCODING_DONTKNOW )
297 				currentCharSet = RTL_TEXTENCODING_ASCII_US;
298 		hConverter = rtl_createTextToUnicodeConverter( currentCharSet );
299 		ASSERT( hConverter, "no string convert available" );
300 		if (!hConverter)
301 			return ERROR_SW_READ_BASE;
302 		bSwapUnicode = false;
303 		hContext = rtl_createTextToUnicodeContext( hConverter );
304 	}
305 	else if (pUseMe != &aEmpty) // Already successfully figured out type
306 	{
307 		rInput.StartReadingUnicodeText( currentCharSet );
308 		bSwapUnicode = rInput.IsEndianSwap();
309 	}
310 
311 	String sWork;
312 	sal_uLong nArrOffset = 0;
313 
314 	do {
315 		if( pStt >= pEnd )
316 		{
317 			if( pLastStt != pStt )
318 				InsertText( String( pLastStt ));
319 
320 			// lese einen neuen Block ein
321 			sal_uLong lGCount;
322 			if( SVSTREAM_OK != rInput.GetError() || 0 == (lGCount =
323 						rInput.Read( pArr + nArrOffset,
324 									 ASC_BUFFLEN - nArrOffset )))
325 				break;		// aus der WHILE-Schleife heraus
326 
327 			/*
328 			#98380#
329 			If there was some unconverted bytes on the last cycle then they
330 			were put at the beginning of the array, so total bytes available
331 			to convert this cycle includes them. If we found 0 following bytes
332 			then we ignore the previous partial character.
333 			*/
334 			lGCount+=nArrOffset;
335 
336 			if( hConverter )
337 			{
338 				sal_uInt32 nInfo;
339 				sal_Size nNewLen = lGCount, nCntBytes;
340 				sal_Unicode* pBuf = sWork.AllocBuffer( static_cast< xub_StrLen >(nNewLen) );
341 
342 				nNewLen = rtl_convertTextToUnicode( hConverter, hContext,
343 								pArr, lGCount, pBuf, nNewLen,
344 								(
345 								RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
346 								RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
347 								RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT |
348 								RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
349 								),
350 								&nInfo,
351 								&nCntBytes );
352 				if( 0 != ( nArrOffset = lGCount - nCntBytes ) )
353 					memmove( pArr, pArr + nCntBytes, nArrOffset );
354 				sWork.ReleaseBufferAccess( static_cast< xub_StrLen >(nNewLen) );
355 
356 				pStt = pLastStt = sWork.GetBufferAccess();
357 				pEnd = pStt + nNewLen;
358 			}
359 			else
360 			{
361 				pStt = pLastStt = (sal_Unicode*)pArr;
362 				pEnd = (sal_Unicode*)(pArr + lGCount);
363 
364 				if( bSwapUnicode )
365 				{
366 					sal_Char* pF = pArr, *pN = pArr + 1;
367 					for( sal_uLong n = 0; n < lGCount; n += 2, pF += 2, pN += 2 )
368 					{
369 						sal_Char c = *pF;
370 						*pF = *pN;
371 						*pN = c;
372 					}
373 				}
374 			}
375 
376 			*pEnd = 0;
377 			nReadCnt += lGCount;
378 
379 			::SetProgressState( nReadCnt, pDoc->GetDocShell() );
380 
381 			if( cLastCR )
382 			{
383 				if( 0x0a == *pStt && 0x0d == cLastCR )
384 					pLastStt = ++pStt;
385 				cLastCR = 0;
386 				nLineLen = 0;
387 				// JP 03.04.96: das letzte am Ende nehmen wir nicht
388 				if( !rInput.IsEof() || !(pEnd == pStt ||
389 					( !*pEnd && pEnd == pStt+1 ) ) )
390 					pDoc->SplitNode( *pPam->GetPoint(), false );
391 			}
392 		}
393 
394 		bool bIns = true, bSplitNode = false;
395 		switch( *pStt )
396 		{
397 //JP 12.11.2001: task 94636 - don't ignore all behind the zero character,
398 //							  change it to the default "control character"
399 //		case 0:
400 //					pEnd = pStt;
401 //					bIns = false ;
402 //					break;
403 
404 		case 0x0a:	if( LINEEND_LF == pUseMe->GetParaFlags() )
405 					{
406 						bIns = false;
407 						*pStt = 0;
408 						++pStt;
409 
410 						// JP 03.04.96: das letzte am Ende nehmen wir nicht
411 						if( !rInput.IsEof() || pEnd != pStt )
412 							bSplitNode = true;
413 					}
414 					break;
415 
416 		case 0x0d:	if( LINEEND_LF != pUseMe->GetParaFlags() )
417 					{
418 						bIns = false;
419 						*pStt = 0;
420 						++pStt;
421 
422 						bool bChkSplit = false;
423 						if( LINEEND_CRLF == pUseMe->GetParaFlags() )
424 						{
425 							if( pStt == pEnd )
426 								cLastCR = 0x0d;
427 							else if( 0x0a == *pStt )
428 							{
429 								++pStt;
430 								bChkSplit = true;
431 							}
432 						}
433 						else
434 							bChkSplit = true;
435 
436 							// JP 03.04.96: das letzte am Ende nehmen wir nicht
437 						if( bChkSplit && ( !rInput.IsEof() || pEnd != pStt ))
438 							bSplitNode = true;
439 					}
440 					break;
441 
442 		case 0x0c:
443 					{
444 						// dann mal einen harten Seitenumbruch einfuegen
445 						*pStt++ = 0;
446 						if( nLineLen )
447 						{
448 							// Change to charset system!!!!
449 							//rOpt.GetCharSet();
450 							InsertText( String( pLastStt ));
451 						}
452 						pDoc->SplitNode( *pPam->GetPoint(), false );
453 						pDoc->InsertPoolItem(
454 							*pPam, SvxFmtBreakItem( SVX_BREAK_PAGE_BEFORE, RES_BREAK ), 0);
455 						pLastStt = pStt;
456 						nLineLen = 0;
457 						bIns = false;
458 					}
459 					break;
460 
461 		case 0x1a:
462 					if( nReadCnt == nFileSize && pStt+1 == pEnd )
463 						*pStt = 0;
464 					else
465 						*pStt = '#'; // Ersatzdarstellung
466 					break;
467 
468 		case '\t':	break;
469 
470 		default:
471 			if( ' ' > *pStt )
472 					// Ctrl-Zchn gefunden ersetze durch '#'
473 				*pStt = '#';
474 			break;
475 		}
476 
477 		if( bIns )
478 		{
479 			if( ( nLineLen >= MAX_ASCII_PARA - 100 ) &&
480 				( ( *pStt == ' ' ) || ( nLineLen >= MAX_ASCII_PARA - 1 ) ) )
481 			{
482 				sal_Unicode c = *pStt;
483 				*pStt = 0;
484 				InsertText( String( pLastStt ));
485 				pDoc->SplitNode( *pPam->GetPoint(), false );
486 				pLastStt = pStt;
487 				nLineLen = 0;
488 				*pStt = c;
489 			}
490 			++pStt;
491 			++nLineLen;
492 		}
493 		else if( bSplitNode )
494 		{
495 			// es wurde ein CR/LF erkannt, also speichere den Text
496 
497 			InsertText( String( pLastStt ));
498 			pDoc->SplitNode( *pPam->GetPoint(), false );
499 			pLastStt = pStt;
500 			nLineLen = 0;
501 		}
502 	} while(true);
503 
504 	if( hConverter )
505 	{
506 		rtl_destroyTextToUnicodeContext( hConverter, hContext );
507 		rtl_destroyTextToUnicodeConverter( hConverter );
508 	}
509 	return 0;
510 }
511 
InsertText(const String & rStr)512 void SwASCIIParser::InsertText( const String& rStr )
513 {
514 	pDoc->InsertString( *pPam, rStr );
515 	if( pItemSet && pBreakIt && nScript != ( SCRIPTTYPE_LATIN |
516 											 SCRIPTTYPE_ASIAN |
517 											 SCRIPTTYPE_COMPLEX ) )
518 		nScript |= pBreakIt->GetAllScriptsOfText( rStr );
519 }
520 
521 /* vim: set noet sw=4 ts=4: */
522