1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_sdext.hxx"
30 
31 #include <stdio.h>
32 #include <sal/main.h>
33 #include <osl/file.h>
34 #include <osl/thread.h>
35 #include <rtl/alloc.h>
36 #include <rtl/ustring.hxx>
37 #include <rtl/strbuf.hxx>
38 
39 #include "pdfparse.hxx"
40 
41 using namespace rtl;
42 using namespace pdfparse;
43 
/** Print the command line usage summary to stdout.

    @param pExe
    name of the executable, inserted at the start of every usage line
*/
void printHelp( const char* pExe )
{
    // the format string contains five %s placeholders, one per usage line
    fprintf( stdout,
    "USAGE: %s [-h,--help]\n"
    "       %s [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "       %s <-a, --extract-add-streams> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "       %s <-f, --extract-fonts> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "       %s <-o, --extract-objects> <o0>[:<g0>][,<o1>[:<g1>][,...]] [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "  -h, --help: show help\n"
    "  -a, --extract-add-streams: extracts additional streams to outputfile_object\n"
    "      and prints the mimetype found to stdout\n"
    "  -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported)\n"
    "  -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n"
    "      object numbers, where object number and generation number are separated by \':\'\n"
    "      an omitted generation number defaults to 0\n"
    "  -pw, --password: use password for decryption\n"
    "\n"
    "note: -f, -a, -o and normal unzip operation are mutually exclusive\n"
    , pExe, pExe, pExe, pExe, pExe );
}
64 
65 class FileEmitContext : public EmitContext
66 {
67     oslFileHandle m_aHandle;
68     oslFileHandle m_aReadHandle;
69     unsigned int  m_nReadLen;
70 
71     void openReadFile( const char* pOrigName );
72 
73     public:
74     FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop );
75     virtual ~FileEmitContext();
76 
77     virtual bool write( const void* pBuf, unsigned int nLen ) throw();
78     virtual unsigned int getCurPos() throw();
79     virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw();
80     virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw();
81 };
82 
83 FileEmitContext::FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop )
84     : EmitContext( pTop ),
85       m_aHandle( NULL ),
86       m_aReadHandle( NULL ),
87       m_nReadLen( 0 )
88 {
89     OUString aSysFile( OStringToOUString( OString( pFileName ), osl_getThreadTextEncoding() ) );
90     OUString aURL;
91     if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
92     {
93         fprintf( stderr, "filename conversion \"%s\" failed\n", pFileName );
94         return;
95     }
96 
97     if( osl_openFile( aURL.pData, &m_aHandle, osl_File_OpenFlag_Write ) == osl_File_E_None )
98     {
99         if( osl_setFileSize( m_aHandle, 0 ) != osl_File_E_None )
100         {
101             fprintf( stderr, "could not truncate %s\n", pFileName );
102             osl_closeFile( m_aHandle );
103             m_aHandle = NULL;
104         }
105     }
106     else if( osl_openFile( aURL.pData, &m_aHandle,
107             osl_File_OpenFlag_Write |osl_File_OpenFlag_Create ) != osl_File_E_None )
108     {
109         fprintf( stderr, "could not open %s\n", pFileName );
110         return;
111     }
112     m_bDeflate = true;
113 
114     openReadFile( pOrigName );
115 }
116 
117 FileEmitContext::~FileEmitContext()
118 {
119     if( m_aHandle )
120         osl_closeFile( m_aHandle );
121     if( m_aReadHandle )
122         osl_closeFile( m_aReadHandle );
123 }
124 
125 void FileEmitContext::openReadFile( const char* pInFile )
126 {
127     OUString aSysFile( OStringToOUString( OString( pInFile ), osl_getThreadTextEncoding() ) );
128     OUString aURL;
129     if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
130     {
131         fprintf( stderr, "filename conversion \"%s\" failed\n", pInFile );
132         return;
133     }
134 
135     if( osl_openFile( aURL.pData, &m_aReadHandle, osl_File_OpenFlag_Read ) != osl_File_E_None )
136     {
137         fprintf( stderr, "could not open %s\n", pInFile );
138         return;
139     }
140 
141     if( osl_setFilePos( m_aReadHandle, osl_Pos_End, 0 ) != osl_File_E_None )
142     {
143         fprintf( stderr, "could not seek to end of %s\n", pInFile );
144         osl_closeFile( m_aReadHandle );
145         return;
146     }
147 
148     sal_uInt64 nFileSize = 0;
149     if( osl_getFilePos( m_aReadHandle, &nFileSize ) != osl_File_E_None )
150     {
151         fprintf( stderr, "could not get end pos of %s\n", pInFile );
152         osl_closeFile( m_aReadHandle );
153         return;
154     }
155 
156     m_nReadLen = static_cast<unsigned int>(nFileSize);
157 }
158 
159 bool FileEmitContext::write( const void* pBuf, unsigned int nLen ) throw()
160 {
161     if( ! m_aHandle )
162         return false;
163 
164     sal_uInt64 nWrite = static_cast<sal_uInt64>(nLen);
165     sal_uInt64 nWritten = 0;
166     return (osl_writeFile( m_aHandle, pBuf, nWrite, &nWritten ) == osl_File_E_None)
167            && nWrite == nWritten;
168 }
169 
170 unsigned int FileEmitContext::getCurPos() throw()
171 {
172     sal_uInt64 nFileSize = 0;
173     if( m_aHandle )
174     {
175         if( osl_getFilePos( m_aHandle, &nFileSize ) != osl_File_E_None )
176             nFileSize = 0;
177     }
178     return static_cast<unsigned int>(nFileSize);
179 }
180 
181 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw()
182 {
183     if( nOrigOffset + nLen > m_nReadLen )
184         return false;
185 
186     if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
187     {
188         fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
189         return false;
190     }
191     void* pBuf = rtl_allocateMemory( nLen );
192     if( ! pBuf )
193         return false;
194     sal_uInt64 nBytesRead = 0;
195     if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None
196         || nBytesRead != static_cast<sal_uInt64>(nLen) )
197     {
198         fprintf( stderr, "could not read %u bytes\n", nLen );
199         rtl_freeMemory( pBuf );
200         return false;
201     }
202     bool bRet = write( pBuf, nLen );
203     rtl_freeMemory( pBuf );
204     return bRet;
205 }
206 
207 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw()
208 {
209     if( nOrigOffset + nLen > m_nReadLen )
210         return 0;
211 
212     if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
213     {
214         fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
215         return 0;
216     }
217     sal_uInt64 nBytesRead = 0;
218     if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None )
219         return 0;
220     return static_cast<unsigned int>(nBytesRead);
221 }
222 
223 typedef int(*PDFFileHdl)(const char*, const char*, PDFFile*);
224 
225 int handleFile( const char* pInFile, const char* pOutFile, const char* pPassword, PDFFileHdl pHdl )
226 {
227 
228     PDFReader aParser;
229     int nRet = 0;
230     PDFEntry* pEntry = aParser.read( pInFile );
231     if( pEntry )
232     {
233         PDFFile* pPDFFile = dynamic_cast<PDFFile*>(pEntry);
234         if( pPDFFile )
235         {
236             fprintf( stdout, "have a %s PDF file\n", pPDFFile->isEncrypted() ? "encrypted" : "unencrypted" );
237             if( pPassword )
238                 fprintf( stdout, "password %s\n",
239                          pPDFFile->setupDecryptionData( pPassword ) ? "matches" : "does not match" );
240             nRet = pHdl( pInFile, pOutFile, pPDFFile );
241         }
242         else
243             nRet = 20;
244         delete pEntry;
245     }
246     return nRet;
247 }
248 
249 int write_unzipFile( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
250 {
251     FileEmitContext aContext( pOutFile, pInFile, pPDFFile );
252     aContext.m_bDecrypt = pPDFFile->isEncrypted();
253     pPDFFile->emit(aContext);
254     return 0;
255 }
256 
257 int write_addStreamArray( const char* pOutFile, PDFArray* pStreams, PDFFile* pPDFFile, const char* pInFile )
258 {
259     int nRet = 0;
260     unsigned int nArrayElements = pStreams->m_aSubElements.size();
261     for( unsigned int i = 0; i < nArrayElements-1 && nRet == 0; i++ )
262     {
263         PDFName* pMimeType = dynamic_cast<PDFName*>(pStreams->m_aSubElements[i]);
264         PDFObjectRef* pStreamRef = dynamic_cast<PDFObjectRef*>(pStreams->m_aSubElements[i+1]);
265         if( ! pMimeType )
266             fprintf( stderr, "error: no mimetype element\n" );
267         if( ! pStreamRef )
268             fprintf( stderr, "error: no stream ref element\n" );
269         if( pMimeType && pStreamRef )
270         {
271             fprintf( stdout, "found stream %d %d with mimetype %s\n",
272                      pStreamRef->m_nNumber, pStreamRef->m_nGeneration,
273                      pMimeType->m_aName.getStr() );
274             PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration );
275             if( pObject )
276             {
277                 rtl::OStringBuffer aOutStream( pOutFile );
278                 aOutStream.append( "_stream_" );
279                 aOutStream.append( sal_Int32(pStreamRef->m_nNumber) );
280                 aOutStream.append( "_" );
281                 aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) );
282                 FileEmitContext aContext( aOutStream.getStr(), pInFile, pPDFFile );
283                 aContext.m_bDecrypt = pPDFFile->isEncrypted();
284                 pObject->writeStream( aContext, pPDFFile );
285             }
286             else
287             {
288                 fprintf( stderr, "object not found\n" );
289                 nRet = 121;
290             }
291         }
292         else
293             nRet = 120;
294     }
295     return nRet;
296 }
297 
298 int write_addStreams( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
299 {
300     // find all trailers
301     int nRet = 0;
302     unsigned int nElements = pPDFFile->m_aSubElements.size();
303     for( unsigned i = 0; i < nElements && nRet == 0; i++ )
304     {
305         PDFTrailer* pTrailer = dynamic_cast<PDFTrailer*>(pPDFFile->m_aSubElements[i]);
306         if( pTrailer && pTrailer->m_pDict )
307         {
308             // search for AdditionalStreams entry
309             std::hash_map<rtl::OString,PDFEntry*,rtl::OStringHash>::iterator add_stream;
310             add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams" );
311             if( add_stream != pTrailer->m_pDict->m_aMap.end() )
312             {
313                 PDFArray* pStreams = dynamic_cast<PDFArray*>(add_stream->second);
314                 if( pStreams )
315                     nRet = write_addStreamArray( pOutFile, pStreams, pPDFFile, pInFile );
316             }
317         }
318     }
319     return nRet;
320 }
321 
322 int write_fonts( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
323 {
324     int nRet = 0;
325     unsigned int nElements = i_pPDFFile->m_aSubElements.size();
326     for( unsigned i = 0; i < nElements && nRet == 0; i++ )
327     {
328         // search FontDescriptors
329         PDFObject* pObj = dynamic_cast<PDFObject*>(i_pPDFFile->m_aSubElements[i]);
330         if( ! pObj )
331             continue;
332         PDFDict* pDict = dynamic_cast<PDFDict*>(pObj->m_pObject);
333         if( ! pDict )
334             continue;
335 
336         std::hash_map<rtl::OString,PDFEntry*,rtl::OStringHash>::iterator map_it =
337                 pDict->m_aMap.find( "Type" );
338         if( map_it == pDict->m_aMap.end() )
339             continue;
340 
341         PDFName* pName = dynamic_cast<PDFName*>(map_it->second);
342         if( ! pName )
343             continue;
344         if( ! pName->m_aName.equals( "FontDescriptor" ) )
345             continue;
346 
347         // the font name will be helpful, also there must be one in
348         // a font descriptor
349         map_it = pDict->m_aMap.find( "FontName" );
350         if( map_it == pDict->m_aMap.end() )
351             continue;
352         pName = dynamic_cast<PDFName*>(map_it->second);
353         if( ! pName )
354             continue;
355         rtl::OString aFontName( pName->m_aName );
356 
357         PDFObjectRef* pStreamRef = 0;
358         const char* pFileType = NULL;
359         // we have a font descriptor, try for a type 1 font
360         map_it = pDict->m_aMap.find( "FontFile" );
361         if( map_it != pDict->m_aMap.end() )
362         {
363             pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
364             if( pStreamRef )
365                 pFileType = "pfa";
366         }
367 
368         // perhaps it's a truetype file ?
369         if( ! pStreamRef )
370         {
371             map_it  = pDict->m_aMap.find( "FontFile2" );
372             if( map_it != pDict->m_aMap.end() )
373             {
374                 pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
375                 if( pStreamRef )
376                     pFileType = "ttf";
377             }
378         }
379 
380         if( ! pStreamRef )
381             continue;
382 
383         PDFObject* pStream = i_pPDFFile->findObject( pStreamRef );
384         if( ! pStream )
385             continue;
386 
387         rtl::OStringBuffer aOutStream( i_pOutFile );
388         aOutStream.append( "_font_" );
389         aOutStream.append( sal_Int32(pStreamRef->m_nNumber) );
390         aOutStream.append( "_" );
391         aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) );
392         aOutStream.append( "_" );
393         aOutStream.append( aFontName );
394         if( pFileType )
395         {
396             aOutStream.append( "." );
397             aOutStream.append( pFileType );
398         }
399         FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
400         aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
401         pStream->writeStream( aContext, i_pPDFFile );
402     }
403     return nRet;
404 }
405 
406 std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects;
407 
408 int write_objects( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
409 {
410     int nRet = 0;
411     unsigned int nElements = s_aEmitObjects.size();
412     for( unsigned i = 0; i < nElements && nRet == 0; i++ )
413     {
414         sal_Int32 nObject     = s_aEmitObjects[i].first;
415         sal_Int32 nGeneration = s_aEmitObjects[i].second;
416         PDFObject* pStream = i_pPDFFile->findObject( nObject, nGeneration );
417         if( ! pStream )
418         {
419             fprintf( stderr, "object %d %d not found !\n", (int)nObject, (int)nGeneration );
420             continue;
421         }
422 
423         rtl::OStringBuffer aOutStream( i_pOutFile );
424         aOutStream.append( "_stream_" );
425         aOutStream.append( nObject );
426         aOutStream.append( "_" );
427         aOutStream.append( nGeneration );
428         FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
429         aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
430         pStream->writeStream( aContext, i_pPDFFile );
431     }
432     return nRet;
433 }
434 
435 SAL_IMPLEMENT_MAIN_WITH_ARGS( argc, argv )
436 {
437     const char* pInFile = NULL;
438     const char* pOutFile = NULL;
439     const char* pPassword = NULL;
440     OStringBuffer aOutFile( 256 );
441     PDFFileHdl aHdl = write_unzipFile;
442 
443     for( int nArg = 1; nArg < argc; nArg++ )
444     {
445         if( argv[nArg][0] == '-' )
446         {
447             if( ! rtl_str_compare( "-pw", argv[nArg] ) ||
448                 ! rtl_str_compare( "--password" , argv[nArg] ) )
449             {
450                 if( nArg == argc-1 )
451                 {
452                     fprintf( stderr, "no password given\n" );
453                     return 1;
454                 }
455                 nArg++;
456                 pPassword = argv[nArg];
457             }
458             else if( ! rtl_str_compare( "-h", argv[nArg] ) ||
459                 ! rtl_str_compare( "--help", argv[nArg] ) )
460             {
461                 printHelp( argv[0] );
462                 return 0;
463             }
464             else if( ! rtl_str_compare( "-a", argv[nArg] ) ||
465                 ! rtl_str_compare( "--extract-add-streams", argv[nArg] ) )
466             {
467                 aHdl = write_addStreams;
468             }
469             else if( ! rtl_str_compare( "-f", argv[nArg] ) ||
470                 ! rtl_str_compare( "--extract-fonts", argv[nArg] ) )
471             {
472                 aHdl = write_fonts;
473             }
474             else if( ! rtl_str_compare( "-o", argv[nArg] ) ||
475                 ! rtl_str_compare( "--extract-objects", argv[nArg] ) )
476             {
477                 aHdl = write_objects;
478                 nArg++;
479                 if( nArg < argc )
480                 {
481                     rtl::OString aObjs( argv[nArg] );
482                     sal_Int32 nIndex = 0;
483                     while( nIndex != -1 )
484                     {
485                         rtl::OString aToken( aObjs.getToken( 0, ',', nIndex ) );
486                         sal_Int32 nObject = 0;
487                         sal_Int32 nGeneration = 0;
488                         sal_Int32 nGenIndex = 0;
489                         nObject = aToken.getToken( 0, ':', nGenIndex ).toInt32();
490                         if( nGenIndex != -1 )
491                             nGeneration = aToken.getToken( 0, ':', nGenIndex ).toInt32();
492                         s_aEmitObjects.push_back( std::pair<sal_Int32,sal_Int32>(nObject,nGeneration) );
493                     }
494                 }
495             }
496             else
497             {
498                 fprintf( stderr, "unrecognized option \"%s\"\n",
499                          argv[nArg] );
500                 printHelp( argv[0] );
501                 return 1;
502             }
503         }
504         else if( pInFile == NULL )
505             pInFile = argv[nArg];
506         else if( pOutFile == NULL )
507             pOutFile = argv[nArg];
508     }
509     if( ! pInFile )
510     {
511         fprintf( stderr, "no input file given\n" );
512         return 10;
513     }
514     if( ! pOutFile )
515     {
516         OString aFile( pInFile );
517         if( aFile.getLength() > 0 )
518         {
519             if( aFile.getLength() > 4 )
520             {
521                 if( aFile.matchIgnoreAsciiCase( OString( ".pdf" ), aFile.getLength()-4 ) )
522                     aOutFile.append( pInFile, aFile.getLength() - 4 );
523                 else
524                     aOutFile.append( aFile );
525             }
526             aOutFile.append( "_unzip.pdf" );
527             pOutFile = aOutFile.getStr();
528         }
529         else
530         {
531             fprintf( stderr, "no output file given\n" );
532             return 11;
533         }
534     }
535 
536     return handleFile( pInFile, pOutFile, pPassword, aHdl );
537 }
538 
539