/**************************************************************
 * 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 * 
 *************************************************************/



package org.openoffice.xmerge.converter.xml.sxw.wordsmith;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.FileInputStream;
import java.io.UnsupportedEncodingException;
import org.openoffice.xmerge.util.Debug;

import org.openoffice.xmerge.converter.palm.*;
import org.openoffice.xmerge.util.Resources;

/**
 *  This class is used by {@link
 *  org.openoffice.xmerge.converter.xml.sxw.wordsmith.DocumentDeserializerImpl
 *  DocumentDeserializerImpl} to decode a WordSmith format.  It
 *  concatenates the (optionally compressed) text records of the
 *  database into one <code>byte</code> array and then splits that
 *  array into <code>Wse</code> elements.
 *
 *  @author   Herbie Ong, David Proulx
 */
final class WSDecoder implements DOCConstants {

    /** Number of low-order bits holding the run length of a back-reference. */
    private final static int COUNT_BITS = 3;

    /**
     *  Bit of the header version that is masked off before the
     *  compression type is examined.  According to the original
     *  author's note it indicates that OOB data is present.
     */
    private final static int OOB_DATA_FLAG = 4;

    /**
     *  Additional version value that is treated as compressed.  It was
     *  determined empirically by the original author and is not part
     *  of <code>DOCConstants</code>.
     */
    private final static int EMPIRICAL_COMPRESSED = 3;

    /** Value expected in the first byte of the concatenated document data. */
    private final static int DOCUMENT_MARKER = 2;

    /**
     *  Number of bytes preceding the first element in the document data.
     *  Going by the names used in earlier revisions of this code, the
     *  bytes after the marker hold four 4-byte counters at offsets
     *  2 (paragraphs), 6 (atoms), 10 (characters) and 14 (misc size);
     *  none of them is currently used.
     */
    private final static int DOCUMENT_HEADER_SIZE = 18;

    /** Resources object for I18N. */
    private final Resources res;


    /**
     *  Default constructor.  Obtains the shared <code>Resources</code>
     *  instance used for localized error messages.
     */
    WSDecoder() {
        res = Resources.getInstance();
    }


    /**
     *  Decode the text records into a single <code>byte</code> array.
     *  The first record is read as the header; every following record
     *  is decompressed (or copied as is) according to the header
     *  version and appended to the result in record order.
     *
     *  @param  recs  <code>Record</code> array holding WordSmith
     *                contents.  The first entry is the header record.
     *
     *  @return  The concatenated content of all text records.
     *
     *  @throws  IOException  If <code>recs</code> is <code>null</code>
     *                        or empty, the header cannot be read, the
     *                        version is unknown, or a record cannot be
     *                        decompressed.
     */
    byte[] parseRecords(Record[] recs) throws IOException {

        if (recs == null || recs.length == 0) {
            throw new IOException(
                "WordSmith database does not contain a header record");
        }

        // read the header record
        HeaderInfo header = readHeader(recs[0].getBytes());
        dumpHeader(header);

        int textRecords = recs.length - 1;
        byte[][] byteArrays = new byte[textRecords][];

        // DJP: the OOB flag only says that OOB data is present.
        // Might also need code to handle that data.
        switch (header.version & ~OOB_DATA_FLAG) {

            case COMPRESSED:
            case EMPIRICAL_COMPRESSED:
                for (int i = 1; i < recs.length; i++) {
                    byteArrays[i - 1] = decompress(recs[i].getBytes(),
                                                   header.textRecordSize);
                    Debug.log(Debug.INFO, "processing " + byteArrays[i-1].length + " bytes");
                }
                break;

            case UNCOMPRESSED:
                for (int i = 1; i < recs.length; i++) {
                    byteArrays[i - 1] = recs[i].getBytes();
                    Debug.log(Debug.INFO, "processing " + byteArrays[i-1].length + " bytes");
                }
                break;

            default:
                throw new IOException(res.getString("UNKNOWN_DOC_VERSION"));
        }

        // Concatenate byteArrays[][] into a single byte array.  The sum
        // is computed as a long so that an oversized document is
        // reported instead of silently wrapping around.
        long total = 0;
        for (int i = 0; i < textRecords; i++) {
            total += byteArrays[i].length;
        }
        if (total > Integer.MAX_VALUE) {
            throw new IOException(res.getString("DOC_TEXT_LENGTH_EXCEEDED"));
        }

        byte[] bigArray = new byte[(int) total];
        int offset = 0;
        for (int i = 0; i < textRecords; i++) {
            System.arraycopy(byteArrays[i], 0, bigArray, offset,
                             byteArrays[i].length);
            offset += byteArrays[i].length;
        }
        return bigArray;
    }


    /**
     *  Decode the text records into a <code>Wse</code> array.
     *
     *  @param  recs  <code>Record</code> array holding DOC
     *                contents.
     *
     *  @return  The parsed elements in document order.  The array has
     *           exactly one entry per element and is empty if the
     *           document contains nothing but its header.
     *
     *  @throws  IOException  If the records cannot be decoded, the
     *                        document header is missing or invalid, or
     *                        an unknown element is encountered.
     */
    Wse[] parseDocument(Record[] recs) throws IOException {

        // Raw collection on purpose: this file does not use generics.
        java.util.List elements = new java.util.ArrayList(20);
        WseFontTable fontTable = null;
        WseColorTable colorTable = null;

        // rawData is the document data to be parsed.
        byte[] rawData = parseRecords(recs);

        // beginning of document has some header information, including
        // optional font and color tables.
        // DJP: maybe should add a new WSelement (docHeader) to hold
        // header info.
        // DJP: finish code here to parse header
        if (rawData.length < DOCUMENT_HEADER_SIZE) {
            throw new IOException("WordSmith document header is truncated");
        }
        if (rawData[0] != DOCUMENT_MARKER) {
            throw new IOException("Invalid WordSmith document marker "
                                  + rawData[0]);
        }

        int curIndex = DOCUMENT_HEADER_SIZE;

        while (curIndex < rawData.length) {
            int nextIndex;

            if (WsePara.isValid(rawData, curIndex)) {
                elements.add(new WsePara(rawData, curIndex));
                nextIndex = WsePara.computeNewIndex(rawData, curIndex);
            } else if (WseTextRun.isValid(rawData, curIndex)) {
                elements.add(new WseTextRun(rawData, curIndex, fontTable,
                                            colorTable));
                nextIndex = WseTextRun.computeNewIndex(rawData, curIndex);
            } else if (WseFontTable.isValid(rawData, curIndex)) {
                fontTable = new WseFontTable(rawData, curIndex);
                elements.add(fontTable);
                nextIndex = WseFontTable.computeNewIndex(rawData, curIndex);
            } else if (WseColorTable.isValid(rawData, curIndex)) {
                colorTable = new WseColorTable(rawData, curIndex);
                elements.add(colorTable);
                nextIndex = WseColorTable.computeNewIndex(rawData, curIndex);
            } else {
                Debug.log(Debug.ERROR, "Unknown code " + rawData[curIndex]);
                throw new IOException("Unknown WordSmith element code "
                                      + rawData[curIndex]);
            }

            // An element that does not advance the index would make this
            // loop spin forever, so treat it as corrupt data.
            if (nextIndex <= curIndex) {
                throw new IOException(
                    "WordSmith element at offset " + curIndex
                    + " does not advance the parse position");
            }
            curIndex = nextIndex;
        }

        // Size the target array exactly.  A larger array would be
        // returned as is, padded with null behind the last element.
        return (Wse[]) elements.toArray(new Wse[elements.size()]);
    }


    /**
     *  <p>Decompress the <code>byte</code> array.</p>
     *
     *  <p>The resulting uncompressed <code>byte</code> array
     *  should be within <code>textRecordSize</code> length,
     *  definitely within twice the size it claims, else treat
     *  it as a problem with the encoding of that PDB and
     *  throw <code>IOException</code>.</p>
     *
     *  @param  cBytes          Compressed <code>byte</code> array
     *  @param  textRecordSize  Size of uncompressed <code>byte</code>
     *                          array
     *
     *  @return  The uncompressed bytes, trimmed to their exact length.
     *
     *  @throws  IOException  If the uncompressed data does not fit into
     *                        twice <code>textRecordSize</code> bytes, or
     *                        the compressed data ends in the middle of
     *                        a code or refers to a position before the
     *                        start of the output.
     */
    private byte[] decompress(byte[] cBytes, int textRecordSize)
        throws IOException {

        // create byte array for storing uncompressed bytes
        // it should be within textRecordSize range, definitely
        // within twice of textRecordSize!  if not, then
        // an ArrayIndexOutOfBoundsException will get thrown,
        // and it should be converted into an IOException, and
        // treat it as a conversion error.
        byte[] uBytes = new byte[textRecordSize * 2];

        int up = 0;     // next write position in uBytes
        int cp = 0;     // next read position in cBytes

        try {

            while (cp < cBytes.length) {

                int c = cBytes[cp++] & 0xff;

                // codes 1...8 mean copy that many bytes
                if (c > 0 && c < 9) {

                    while (c-- > 0) {
                        uBytes[up++] = cBytes[cp++];
                    }
                }

                // codes 0, 9...0x7F represent themselves
                else if (c < 0x80) {
                    uBytes[up++] = (byte) c;
                }

                // codes 0xC0...0xFF represent "space + ascii char"
                else if (c >= 0xC0) {
                    uBytes[up++] = (byte) ' ';
                    uBytes[up++] = (byte) (c ^ 0x80);
                }

                // codes 0x80...0xBF represent sequences: together with
                // the next byte they form a 16-bit value whose low
                // COUNT_BITS bits give the run length (minus
                // COUNT_BITS) and whose remaining low 14 bits give the
                // distance to copy from, counted back from "up".
                else {
                    c <<= 8;
                    c += cBytes[cp++] & 0xff;
                    int m = (c & 0x3fff) >> COUNT_BITS;
                    int n = c & ((1 << COUNT_BITS) - 1);
                    n += COUNT_BITS;
                    // copy byte by byte: source and target may overlap
                    while (n-- > 0) {
                        uBytes[up] = uBytes[up - m];
                        up++;
                    }
                }
            }

        } catch (ArrayIndexOutOfBoundsException e) {

            IOException failure = new IOException(
                res.getString("DOC_TEXT_RECORD_SIZE_EXCEEDED"));
            // keep the original failure for diagnosis
            failure.initCause(e);
            throw failure;
        }

        // note that uBytes may be larger than the amount of
        // uncompressed bytes, so trim it to another byte array
        // with the exact size.
        byte[] textBytes = new byte[up];
        System.arraycopy(uBytes, 0, textBytes, 0, up);

        return textBytes;
    }


    /**
     *  Read the header <code>byte</code> array.
     *
     *  @param  bytes  <code>byte</code> array containing header
     *                 record data.
     *
     *  @return  <code>HeaderInfo</code> object.
     *
     *  @throws  IOException  If the header data is too short or the
     *                        text length does not fit into a positive
     *                        <code>int</code>.
     */
    private HeaderInfo readHeader(byte[] bytes) throws IOException {

        HeaderInfo header = new HeaderInfo();

        ByteArrayInputStream bis = new ByteArrayInputStream(bytes);
        DataInputStream dis = new DataInputStream(bis);

        // Normally the first 2 bytes comprised of the version
        // which should either be COMPRESSED or UNCOMPRESSED
        // SmartDoc/Quickword would add a 0x01 to the first
        // byte, thus their version would be 0x0101 for UNCOMPRESSED
        // instead of 0x0001 and 0x0102 for COMPRESSED instead of
        // 0x0002.  Hence only the second byte is used as version.

        dis.readByte();
        header.version = dis.readByte();

        // read extra 2 unused bytes
        dis.readShort();

        // Read the text length, this should be unsigned 4 bytes.
        // We could store the read value into a long, but then
        // our current buffer limit is the max positive of an int.
        // That is a large enough limit, thus we shall stay with
        // storing the value in an int.  If it exceeds, then
        // an IOException should be thrown.
        header.textLen = dis.readInt();
        if (header.textLen < 0) {
            throw new IOException(res.getString("DOC_TEXT_LENGTH_EXCEEDED"));
        }

        // read the number of records - unsigned 2 bytes
        header.textRecordCount = dis.readUnsignedShort();

        // read the record size - unsigned 2 bytes
        header.textRecordSize = dis.readUnsignedShort();

        // read extra 4 unused bytes
        dis.readInt();

        return header;
    }


    /**
     *  Prints out header info into log.
     *  Used for debugging purposes only.
     *
     *  @param  header  <code>HeaderInfo</code> structure.
     */
    private void dumpHeader(HeaderInfo header) {

        Debug.log(Debug.INFO,
                  "<DOC_INFO "
                  + "version=\"" + header.version + "\" "
                  + "text-length=\"" + header.textLen + "\" "
                  + "number-of-records=\"" + header.textRecordCount + "\" "
                  + "record-size=\"" + header.textRecordSize + "\" />");
    }


    /**
     *  Nested class to store DOC header information.  It is static
     *  because it never needs access to the enclosing decoder.
     */
    private static class HeaderInfo {

        /** length of text section */
        int textLen = 0;

        /** number of text records */
        int textRecordCount = 0;

        /**
         *  size of a text record.  This is normally the same as
         *  TEXT_RECORD_SIZE, but some applications may modify this.
         */
        int textRecordSize = 0;

        /** compression type */
        int version = 0;
    }
}
