1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 // MARKER(update_precomp.py): autogen include statement, do not remove 29 #include "precompiled_sdext.hxx" 30 31 #include <stdio.h> 32 #include <sal/main.h> 33 #include <osl/file.h> 34 #include <osl/thread.h> 35 #include <rtl/alloc.h> 36 #include <rtl/ustring.hxx> 37 #include <rtl/strbuf.hxx> 38 39 #include "pdfparse.hxx" 40 41 using namespace rtl; 42 using namespace pdfparse; 43 44 void printHelp( const char* pExe ) 45 { 46 fprintf( stdout, 47 "USAGE: %s [-h,--help]\n" 48 " %s [-pw, --password <password>] <inputfile> [<outputfile>]\n" 49 " %s <-a, --extract-add-streams> [-pw, --password <password>] <inputfile> [<outputfile>]\n" 50 " %s <-f, --extract-fonts> [-pw, --password <password>] <inputfile> [<outputfile>]\n" 51 " %s <-o, --extract-objects> <o0>[:<g0>][,<o1>[:g1][,...]] [-pw, --password <password>] <inputfile> [<outputfile>]\n" 52 " -h, --help: show help\n" 53 " -a, --extract-add-streams: extracts additional streams to outputfile_object\n" 54 " and prints the mimetype found to stdout\n" 55 " -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported\n" 56 " -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n" 57 " object numbers, where object number and generation number are separated by \':\'\n" 58 " an omitted generation number defaults to 0\n" 59 " -pw, --password: use password for decryption\n" 60 "\n" 61 "note: -f, -a, -o and normal unzip operation are mutually exclusive\n" 62 , pExe, pExe, pExe, pExe, pExe ); 63 } 64 65 class FileEmitContext : public EmitContext 66 { 67 oslFileHandle m_aHandle; 68 oslFileHandle m_aReadHandle; 69 unsigned int m_nReadLen; 70 71 void openReadFile( const char* pOrigName ); 72 73 public: 74 FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop ); 75 virtual ~FileEmitContext(); 76 77 virtual bool write( const void* pBuf, unsigned int nLen ) throw(); 78 virtual unsigned int getCurPos() throw(); 79 virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw(); 80 virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw(); 81 }; 82 83 FileEmitContext::FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop ) 84 : EmitContext( pTop ), 85 m_aHandle( NULL ), 86 m_aReadHandle( NULL ), 87 m_nReadLen( 0 ) 88 { 89 OUString aSysFile( OStringToOUString( OString( pFileName ), osl_getThreadTextEncoding() ) ); 90 OUString aURL; 91 if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None ) 92 { 93 fprintf( stderr, "filename conversion \"%s\" failed\n", pFileName ); 94 return; 95 } 96 97 if( osl_openFile( aURL.pData, &m_aHandle, osl_File_OpenFlag_Write ) == osl_File_E_None ) 98 { 99 if( osl_setFileSize( m_aHandle, 0 ) != osl_File_E_None ) 100 { 101 fprintf( stderr, "could not truncate %s\n", pFileName ); 102 osl_closeFile( m_aHandle ); 103 m_aHandle = NULL; 104 } 105 } 106 else if( osl_openFile( aURL.pData, &m_aHandle, 107 osl_File_OpenFlag_Write |osl_File_OpenFlag_Create ) != osl_File_E_None ) 108 { 109 fprintf( stderr, "could not open %s\n", pFileName ); 110 return; 111 } 112 m_bDeflate = true; 113 114 openReadFile( pOrigName ); 115 } 116 117 FileEmitContext::~FileEmitContext() 118 { 119 if( m_aHandle ) 120 osl_closeFile( m_aHandle ); 121 if( m_aReadHandle ) 122 osl_closeFile( m_aReadHandle ); 123 } 124 125 void FileEmitContext::openReadFile( const char* pInFile ) 126 { 127 OUString aSysFile( OStringToOUString( OString( pInFile ), osl_getThreadTextEncoding() ) ); 128 OUString aURL; 129 if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None ) 130 { 131 fprintf( stderr, "filename conversion \"%s\" failed\n", pInFile ); 132 return; 133 } 134 135 if( osl_openFile( aURL.pData, &m_aReadHandle, osl_File_OpenFlag_Read ) != osl_File_E_None ) 136 { 137 fprintf( stderr, "could not open %s\n", pInFile ); 138 return; 139 } 140 141 if( osl_setFilePos( m_aReadHandle, osl_Pos_End, 0 ) != osl_File_E_None ) 142 { 143 fprintf( stderr, "could not seek to end of %s\n", pInFile ); 144 osl_closeFile( m_aReadHandle ); 145 return; 146 } 147 148 sal_uInt64 nFileSize = 0; 149 if( osl_getFilePos( m_aReadHandle, &nFileSize ) != osl_File_E_None ) 150 { 151 fprintf( stderr, "could not get end pos of %s\n", pInFile ); 152 osl_closeFile( m_aReadHandle ); 153 return; 154 } 155 156 m_nReadLen = static_cast<unsigned int>(nFileSize); 157 } 158 159 bool FileEmitContext::write( const void* pBuf, unsigned int nLen ) throw() 160 { 161 if( ! m_aHandle ) 162 return false; 163 164 sal_uInt64 nWrite = static_cast<sal_uInt64>(nLen); 165 sal_uInt64 nWritten = 0; 166 return (osl_writeFile( m_aHandle, pBuf, nWrite, &nWritten ) == osl_File_E_None) 167 && nWrite == nWritten; 168 } 169 170 unsigned int FileEmitContext::getCurPos() throw() 171 { 172 sal_uInt64 nFileSize = 0; 173 if( m_aHandle ) 174 { 175 if( osl_getFilePos( m_aHandle, &nFileSize ) != osl_File_E_None ) 176 nFileSize = 0; 177 } 178 return static_cast<unsigned int>(nFileSize); 179 } 180 181 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw() 182 { 183 if( nOrigOffset + nLen > m_nReadLen ) 184 return false; 185 186 if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None ) 187 { 188 fprintf( stderr, "could not seek to offset %u\n", nOrigOffset ); 189 return false; 190 } 191 void* pBuf = rtl_allocateMemory( nLen ); 192 if( ! pBuf ) 193 return false; 194 sal_uInt64 nBytesRead = 0; 195 if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None 196 || nBytesRead != static_cast<sal_uInt64>(nLen) ) 197 { 198 fprintf( stderr, "could not read %u bytes\n", nLen ); 199 rtl_freeMemory( pBuf ); 200 return false; 201 } 202 bool bRet = write( pBuf, nLen ); 203 rtl_freeMemory( pBuf ); 204 return bRet; 205 } 206 207 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw() 208 { 209 if( nOrigOffset + nLen > m_nReadLen ) 210 return 0; 211 212 if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None ) 213 { 214 fprintf( stderr, "could not seek to offset %u\n", nOrigOffset ); 215 return 0; 216 } 217 sal_uInt64 nBytesRead = 0; 218 if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None ) 219 return 0; 220 return static_cast<unsigned int>(nBytesRead); 221 } 222 223 typedef int(*PDFFileHdl)(const char*, const char*, PDFFile*); 224 225 int handleFile( const char* pInFile, const char* pOutFile, const char* pPassword, PDFFileHdl pHdl ) 226 { 227 228 PDFReader aParser; 229 int nRet = 0; 230 PDFEntry* pEntry = aParser.read( pInFile ); 231 if( pEntry ) 232 { 233 PDFFile* pPDFFile = dynamic_cast<PDFFile*>(pEntry); 234 if( pPDFFile ) 235 { 236 fprintf( stdout, "have a %s PDF file\n", pPDFFile->isEncrypted() ? "encrypted" : "unencrypted" ); 237 if( pPassword ) 238 fprintf( stdout, "password %s\n", 239 pPDFFile->setupDecryptionData( pPassword ) ? "matches" : "does not match" ); 240 nRet = pHdl( pInFile, pOutFile, pPDFFile ); 241 } 242 else 243 nRet = 20; 244 delete pEntry; 245 } 246 return nRet; 247 } 248 249 int write_unzipFile( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile ) 250 { 251 FileEmitContext aContext( pOutFile, pInFile, pPDFFile ); 252 aContext.m_bDecrypt = pPDFFile->isEncrypted(); 253 pPDFFile->emit(aContext); 254 return 0; 255 } 256 257 int write_addStreamArray( const char* pOutFile, PDFArray* pStreams, PDFFile* pPDFFile, const char* pInFile ) 258 { 259 int nRet = 0; 260 unsigned int nArrayElements = pStreams->m_aSubElements.size(); 261 for( unsigned int i = 0; i < nArrayElements-1 && nRet == 0; i++ ) 262 { 263 PDFName* pMimeType = dynamic_cast<PDFName*>(pStreams->m_aSubElements[i]); 264 PDFObjectRef* pStreamRef = dynamic_cast<PDFObjectRef*>(pStreams->m_aSubElements[i+1]); 265 if( ! pMimeType ) 266 fprintf( stderr, "error: no mimetype element\n" ); 267 if( ! pStreamRef ) 268 fprintf( stderr, "error: no stream ref element\n" ); 269 if( pMimeType && pStreamRef ) 270 { 271 fprintf( stdout, "found stream %d %d with mimetype %s\n", 272 pStreamRef->m_nNumber, pStreamRef->m_nGeneration, 273 pMimeType->m_aName.getStr() ); 274 PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration ); 275 if( pObject ) 276 { 277 rtl::OStringBuffer aOutStream( pOutFile ); 278 aOutStream.append( "_stream_" ); 279 aOutStream.append( sal_Int32(pStreamRef->m_nNumber) ); 280 aOutStream.append( "_" ); 281 aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) ); 282 FileEmitContext aContext( aOutStream.getStr(), pInFile, pPDFFile ); 283 aContext.m_bDecrypt = pPDFFile->isEncrypted(); 284 pObject->writeStream( aContext, pPDFFile ); 285 } 286 else 287 { 288 fprintf( stderr, "object not found\n" ); 289 nRet = 121; 290 } 291 } 292 else 293 nRet = 120; 294 } 295 return nRet; 296 } 297 298 int write_addStreams( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile ) 299 { 300 // find all trailers 301 int nRet = 0; 302 unsigned int nElements = pPDFFile->m_aSubElements.size(); 303 for( unsigned i = 0; i < nElements && nRet == 0; i++ ) 304 { 305 PDFTrailer* pTrailer = dynamic_cast<PDFTrailer*>(pPDFFile->m_aSubElements[i]); 306 if( pTrailer && pTrailer->m_pDict ) 307 { 308 // search for AdditionalStreams entry 309 std::hash_map<rtl::OString,PDFEntry*,rtl::OStringHash>::iterator add_stream; 310 add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams" ); 311 if( add_stream != pTrailer->m_pDict->m_aMap.end() ) 312 { 313 PDFArray* pStreams = dynamic_cast<PDFArray*>(add_stream->second); 314 if( pStreams ) 315 nRet = write_addStreamArray( pOutFile, pStreams, pPDFFile, pInFile ); 316 } 317 } 318 } 319 return nRet; 320 } 321 322 int write_fonts( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile ) 323 { 324 int nRet = 0; 325 unsigned int nElements = i_pPDFFile->m_aSubElements.size(); 326 for( unsigned i = 0; i < nElements && nRet == 0; i++ ) 327 { 328 // search FontDescriptors 329 PDFObject* pObj = dynamic_cast<PDFObject*>(i_pPDFFile->m_aSubElements[i]); 330 if( ! pObj ) 331 continue; 332 PDFDict* pDict = dynamic_cast<PDFDict*>(pObj->m_pObject); 333 if( ! pDict ) 334 continue; 335 336 std::hash_map<rtl::OString,PDFEntry*,rtl::OStringHash>::iterator map_it = 337 pDict->m_aMap.find( "Type" ); 338 if( map_it == pDict->m_aMap.end() ) 339 continue; 340 341 PDFName* pName = dynamic_cast<PDFName*>(map_it->second); 342 if( ! pName ) 343 continue; 344 if( ! pName->m_aName.equals( "FontDescriptor" ) ) 345 continue; 346 347 // the font name will be helpful, also there must be one in 348 // a font descriptor 349 map_it = pDict->m_aMap.find( "FontName" ); 350 if( map_it == pDict->m_aMap.end() ) 351 continue; 352 pName = dynamic_cast<PDFName*>(map_it->second); 353 if( ! pName ) 354 continue; 355 rtl::OString aFontName( pName->m_aName ); 356 357 PDFObjectRef* pStreamRef = 0; 358 const char* pFileType = NULL; 359 // we have a font descriptor, try for a type 1 font 360 map_it = pDict->m_aMap.find( "FontFile" ); 361 if( map_it != pDict->m_aMap.end() ) 362 { 363 pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second); 364 if( pStreamRef ) 365 pFileType = "pfa"; 366 } 367 368 // perhaps it's a truetype file ? 369 if( ! pStreamRef ) 370 { 371 map_it = pDict->m_aMap.find( "FontFile2" ); 372 if( map_it != pDict->m_aMap.end() ) 373 { 374 pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second); 375 if( pStreamRef ) 376 pFileType = "ttf"; 377 } 378 } 379 380 if( ! pStreamRef ) 381 continue; 382 383 PDFObject* pStream = i_pPDFFile->findObject( pStreamRef ); 384 if( ! pStream ) 385 continue; 386 387 rtl::OStringBuffer aOutStream( i_pOutFile ); 388 aOutStream.append( "_font_" ); 389 aOutStream.append( sal_Int32(pStreamRef->m_nNumber) ); 390 aOutStream.append( "_" ); 391 aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) ); 392 aOutStream.append( "_" ); 393 aOutStream.append( aFontName ); 394 if( pFileType ) 395 { 396 aOutStream.append( "." ); 397 aOutStream.append( pFileType ); 398 } 399 FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile ); 400 aContext.m_bDecrypt = i_pPDFFile->isEncrypted(); 401 pStream->writeStream( aContext, i_pPDFFile ); 402 } 403 return nRet; 404 } 405 406 std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects; 407 408 int write_objects( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile ) 409 { 410 int nRet = 0; 411 unsigned int nElements = s_aEmitObjects.size(); 412 for( unsigned i = 0; i < nElements && nRet == 0; i++ ) 413 { 414 sal_Int32 nObject = s_aEmitObjects[i].first; 415 sal_Int32 nGeneration = s_aEmitObjects[i].second; 416 PDFObject* pStream = i_pPDFFile->findObject( nObject, nGeneration ); 417 if( ! pStream ) 418 { 419 fprintf( stderr, "object %d %d not found !\n", (int)nObject, (int)nGeneration ); 420 continue; 421 } 422 423 rtl::OStringBuffer aOutStream( i_pOutFile ); 424 aOutStream.append( "_stream_" ); 425 aOutStream.append( nObject ); 426 aOutStream.append( "_" ); 427 aOutStream.append( nGeneration ); 428 FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile ); 429 aContext.m_bDecrypt = i_pPDFFile->isEncrypted(); 430 pStream->writeStream( aContext, i_pPDFFile ); 431 } 432 return nRet; 433 } 434 435 SAL_IMPLEMENT_MAIN_WITH_ARGS( argc, argv ) 436 { 437 const char* pInFile = NULL; 438 const char* pOutFile = NULL; 439 const char* pPassword = NULL; 440 OStringBuffer aOutFile( 256 ); 441 PDFFileHdl aHdl = write_unzipFile; 442 443 for( int nArg = 1; nArg < argc; nArg++ ) 444 { 445 if( argv[nArg][0] == '-' ) 446 { 447 if( ! rtl_str_compare( "-pw", argv[nArg] ) || 448 ! rtl_str_compare( "--password" , argv[nArg] ) ) 449 { 450 if( nArg == argc-1 ) 451 { 452 fprintf( stderr, "no password given\n" ); 453 return 1; 454 } 455 nArg++; 456 pPassword = argv[nArg]; 457 } 458 else if( ! rtl_str_compare( "-h", argv[nArg] ) || 459 ! rtl_str_compare( "--help", argv[nArg] ) ) 460 { 461 printHelp( argv[0] ); 462 return 0; 463 } 464 else if( ! rtl_str_compare( "-a", argv[nArg] ) || 465 ! rtl_str_compare( "--extract-add-streams", argv[nArg] ) ) 466 { 467 aHdl = write_addStreams; 468 } 469 else if( ! rtl_str_compare( "-f", argv[nArg] ) || 470 ! rtl_str_compare( "--extract-fonts", argv[nArg] ) ) 471 { 472 aHdl = write_fonts; 473 } 474 else if( ! rtl_str_compare( "-o", argv[nArg] ) || 475 ! rtl_str_compare( "--extract-objects", argv[nArg] ) ) 476 { 477 aHdl = write_objects; 478 nArg++; 479 if( nArg < argc ) 480 { 481 rtl::OString aObjs( argv[nArg] ); 482 sal_Int32 nIndex = 0; 483 while( nIndex != -1 ) 484 { 485 rtl::OString aToken( aObjs.getToken( 0, ',', nIndex ) ); 486 sal_Int32 nObject = 0; 487 sal_Int32 nGeneration = 0; 488 sal_Int32 nGenIndex = 0; 489 nObject = aToken.getToken( 0, ':', nGenIndex ).toInt32(); 490 if( nGenIndex != -1 ) 491 nGeneration = aToken.getToken( 0, ':', nGenIndex ).toInt32(); 492 s_aEmitObjects.push_back( std::pair<sal_Int32,sal_Int32>(nObject,nGeneration) ); 493 } 494 } 495 } 496 else 497 { 498 fprintf( stderr, "unrecognized option \"%s\"\n", 499 argv[nArg] ); 500 printHelp( argv[0] ); 501 return 1; 502 } 503 } 504 else if( pInFile == NULL ) 505 pInFile = argv[nArg]; 506 else if( pOutFile == NULL ) 507 pOutFile = argv[nArg]; 508 } 509 if( ! pInFile ) 510 { 511 fprintf( stderr, "no input file given\n" ); 512 return 10; 513 } 514 if( ! pOutFile ) 515 { 516 OString aFile( pInFile ); 517 if( aFile.getLength() > 0 ) 518 { 519 if( aFile.getLength() > 4 ) 520 { 521 if( aFile.matchIgnoreAsciiCase( OString( ".pdf" ), aFile.getLength()-4 ) ) 522 aOutFile.append( pInFile, aFile.getLength() - 4 ); 523 else 524 aOutFile.append( aFile ); 525 } 526 aOutFile.append( "_unzip.pdf" ); 527 pOutFile = aOutFile.getStr(); 528 } 529 else 530 { 531 fprintf( stderr, "no output file given\n" ); 532 return 11; 533 } 534 } 535 536 return handleFile( pInFile, pOutFile, pPassword, aHdl ); 537 } 538 539