1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 package org.openoffice.xmerge.converter.xml.sxw.wordsmith;
25 
26 import java.io.ByteArrayInputStream;
27 import java.io.DataInputStream;
28 import java.io.IOException;
29 import java.io.FileInputStream;
30 import java.io.UnsupportedEncodingException;
31 import org.openoffice.xmerge.util.Debug;
32 
33 import org.openoffice.xmerge.converter.palm.*;
34 import org.openoffice.xmerge.util.Resources;
35 
36 /**
37  *  This class is used by {@link
38  *  org.openoffice.xmerge.converter.xml.sxw.wordsmith.DocumentDeserializerImpl
39  *  DocumentDeserializerImpl} to decode a WordSmith format.  It currently
40  *  decodes the text content into a single <code>String</code> object.
41  *
42  *  @author   Herbie Ong, David Proulx
43  */
44 final class WSDecoder implements DOCConstants {
45 
46     /** For decoding purposes. */
47     private final static int COUNT_BITS = 3;
48 
49     /** Resources object for I18N. */
50     private Resources res = null;
51 
52     /**
53      *  Default constructor creates a header and
54      *  a text buffer for holding all the text in
55      *  the DOC db.
56      */
WSDecoder()57     WSDecoder() {
58         res = Resources.getInstance();
59     }
60 
61     /**
62      *  Decode the text records into a single <code>byte</code> array.
63      *
64      *  @param  recs  <code>Record</code> array holding WordSmith
65      *                  contents.
66      *
67      *  @throws  IOException  If any I/O error occurs.
68      */
parseRecords(Record[] recs)69     byte[] parseRecords(Record[] recs) throws IOException {
70 
71         // read the header record
72         HeaderInfo header = readHeader(recs[0].getBytes());
73         dumpHeader(header);
74         byte[][] byteArrays = new byte[recs.length - 1][];
75         for (int i = 0; i < recs.length - 1; i++) byteArrays[i] = null;
76 
77         switch (header.version & ~4) {  // DJP: "4" indicates OOB data is present.
78         // Add a constant to handle this, might also need code to handle it.
79 
80             case COMPRESSED:
81             case 3:   // DJP: determined this empirically.  Are Herbie's constants wrong?
82                 for (int i = 1; i < recs.length; i++) {
83                     byteArrays[i-1] = decompress(recs[i].getBytes(),
84                     header.textRecordSize);
85                     Debug.log(Debug.INFO, "processing " + byteArrays[i-1].length + " bytes");
86                 }
87 
88                 break;
89 
90             case UNCOMPRESSED:
91                 for (int i = 1; i < recs.length; i++) {
92                     byteArrays[i-1] = recs[i].getBytes();
93                     Debug.log(Debug.INFO, "processing " + byteArrays[i-1].length + " bytes");
94                 }
95 
96                 break;
97 
98             default:
99                 throw new IOException(res.getString("UNKNOWN_DOC_VERSION"));
100 
101         }
102 
103         // Concatenate byteArrays[][] into a single byte array.
104         int length = 0;
105         for (int i = 0; i < recs.length - 1; i++)
106             length += byteArrays[i].length;
107         byte bigArray[] = new byte[length];
108         int offset = 0;
109         for (int i = 0; i < recs.length - 1; i++) {
110             System.arraycopy(byteArrays[i], 0, bigArray, offset,
111             byteArrays[i].length);
112             offset += byteArrays[i].length;
113         }
114         return bigArray;
115     }
116 
117 
118     /**
119      *  Decode the text records into a <code>Wse</code> array.
120      *
121      *  @param  recs  <code>Record</code> array holding DOC
122      *                    contents.
123      *
124      *  @throws  IOException  If any I/O error occurs.
125      */
parseDocument(Record[] recs)126     Wse[] parseDocument(Record[] recs) throws IOException {
127 
128         java.util.Vector v = new java.util.Vector(20, 20);
129         WseFontTable fontTable = null;
130         WseColorTable colorTable = null;
131 
132         // rawData is the document data to be parsed.
133         byte rawData[] = parseRecords(recs);
134 
135         // beginning of document has some header information, including
136         // optional font and color tables.
137         // DJP: maybe should add a new WSelement (docHeader) to hold
138         // header info.
139         // DJP: finish code here to parse header
140         if (rawData[0] != 2) throw new IOException();
141         int nParagraphs = util.intFrom4bytes(rawData, 2);
142         int nAtoms      = util.intFrom4bytes(rawData, 6);
143         int nChars      = util.intFrom4bytes(rawData, 10);
144         int miscSize    = util.intFrom4bytes(rawData, 14);
145         int curIndex = 18;
146 
147         while (curIndex < rawData.length) {
148             if (WsePara.isValid(rawData, curIndex)) {
149                 v.add(new WsePara(rawData, curIndex));
150                 curIndex = WsePara.computeNewIndex(rawData, curIndex);
151             } else if (WseTextRun.isValid(rawData, curIndex)) {
152                 v.add(new WseTextRun(rawData, curIndex, fontTable, colorTable));
153                 curIndex = WseTextRun.computeNewIndex(rawData, curIndex);
154             } else if (WseFontTable.isValid(rawData, curIndex)) {
155                 fontTable = new WseFontTable(rawData, curIndex);
156                 v.add(fontTable);
157                 curIndex = WseFontTable.computeNewIndex(rawData, curIndex);
158             } else if (WseColorTable.isValid(rawData, curIndex)) {
159                 colorTable = new WseColorTable(rawData, curIndex);
160                 v.add(colorTable);
161                 curIndex = WseColorTable.computeNewIndex(rawData, curIndex);
162             } else {
163                 Debug.log(Debug.ERROR, "Unknown code " + rawData[curIndex]);
164                 throw new IOException();
165             }
166         }
167 
168         return (Wse[])v.toArray(new Wse[2]);
169     }
170 
171 
172     /**
173      *  <p>Decompress the <code>byte</code> array.</p>
174      *
175      *  <p>The resulting uncompressed <code>byte</code> array
176      *  should be within <code>textRecordSize</code> length,
177      *  definitely within twice the size it claims, else treat
178      *  it as a problem with the encoding of that PDB and
179      *  throw <code>IOException</code>.</p>
180      *
181      *  @param  cBytes           Compressed <code>byte</code> array
182      *  @param  textRecordSize  Size of uncompressed <code>byte</code>
183      *                          array
184      *
185      *  @throws   IOException  If <code>textRecordSize</codeL &lt;
186      *                         <code>cBytes.length</code>.
187      */
decompress(byte[] cBytes, int textRecordSize)188     private byte[] decompress(byte[] cBytes, int textRecordSize)
189     throws IOException {
190 
191         // create byte array for storing uncompressed bytes
192         // it should be within textRecordSize range, definitely
193         // within twice of textRecordSize!  if not, then
194         // an ArrayIndexOutOfBoundsException will get thrown,
195         // and it should be converted into an IOException, and
196         // treat it as a conversion error.
197         byte[] uBytes = new byte[textRecordSize*2];
198 
199         int up = 0;
200         int cp = 0;
201 
202         try {
203 
204             while (cp < cBytes.length) {
205 
206                 int c = cBytes[cp++] & 0xff;
207 
208                 // codes 1...8 mean copy that many bytes
209                 if (c > 0 && c < 9) {
210 
211                     while (c-- > 0)
212                         uBytes[up++] = cBytes[cp++];
213                 }
214 
215                 // codes 0, 9...0x7F represent themselves
216                 else if (c < 0x80) {
217                     uBytes[up++] = (byte) c;
218                 }
219 
220                 // codes 0xC0...0xFF represent "space + ascii char"
221                 else if (c >= 0xC0) {
222                     uBytes[up++] = (byte) ' ';
223                     uBytes[up++] = (byte) (c ^ 0x80);
224                 }
225 
226                 // codes 0x80...0xBf represent sequences
227                 else {
228                     c <<= 8;
229                     c += cBytes[cp++] & 0xff;
230                     int m = (c & 0x3fff) >> COUNT_BITS;
231                     int n = c & ((1 << COUNT_BITS) - 1);
232                     n += COUNT_BITS;
233                     while (n-- > 0) {
234                         uBytes[up] = uBytes[up - m];
235                         up++;
236                     }
237                 }
238             }
239 
240         } catch (ArrayIndexOutOfBoundsException e) {
241 
242             throw new IOException(
243             res.getString("DOC_TEXT_RECORD_SIZE_EXCEEDED"));
244         }
245 
246         // note that ubytes may be larger that the amount of
247         // uncompressed bytes, so trim it to another byte array
248         // with the exact size.
249         byte[] textBytes = new byte[up];
250         System.arraycopy(uBytes, 0, textBytes, 0, up);
251 
252         return textBytes;
253     }
254 
255 
256     /**
257      *  Read the header <code>byte</code> array.
258      *
259      *  @param  bytes  <code>byte</code> array containing header
260      *                 record data.
261      *
262      *  @return  <code>HeaderInfo</code> object.
263      *
264      *  @throws  IOException  If any I/O error occurs.
265      */
readHeader(byte[] bytes)266     private HeaderInfo readHeader(byte[] bytes) throws IOException {
267 
268         HeaderInfo header = new HeaderInfo();
269 
270         ByteArrayInputStream bis = new ByteArrayInputStream(bytes);
271         DataInputStream dis = new DataInputStream(bis);
272 
273         // Normally the first 2 bytes comprised of the version
274         // which should either be COMPRESSED or UNCOMPRESSED
275         // SmartDoc/Quickword would add a 0x01 to the first
276         // byte, thus their version would be 0x0101 for UNCOMPRESSED
277         // instead of 0x0001 and 0x0102 for UNCOMPRESSED instead of
278         // 0x0002.
279 
280         dis.readByte();
281         header.version = dis.readByte();
282 
283         // read extra 2 unused bytes
284         dis.readShort();
285 
286         // Read the text length, this should be unsigned 4 bytes.
287         // We could store the read value into a long, but then
288         // our current buffer limit is the max positive of an int.
289         // That is a large enough limit, thus we shall stay with
290         // storing the value in an int.  If it exceeds, then
291         // an IOException should be thrown.
292         header.textLen = dis.readInt();
293         if (header.textLen < 0) {
294             throw new IOException(res.getString("DOC_TEXT_LENGTH_EXCEEDED"));
295         }
296 
297         // read the number of records - unsigned 2 bytes
298         header.textRecordCount = ((int) dis.readShort()) & 0x0000ffff;
299 
300         // read the record size - unsigned 2 bytes
301         header.textRecordSize = ((int) dis.readShort()) & 0x0000ffff;
302 
303         // read extra 4 unused bytes
304         dis.readInt();
305 
306         return header;
307     }
308 
309 
310     /**
311      *  Prints out header info into log.
312      *  Used for debugging purposes only.
313      *
314      *  @param  header  <code>HeaderInfo</code> structure.
315      */
dumpHeader(HeaderInfo header)316     private void dumpHeader(HeaderInfo header) {
317     /*
318         log("<DOC_INFO ");
319         log("version=\"" + header.version + "\" ");
320         log("text-length=\"" + header.textLen + "\" ");
321         log("number-of-records=\"" + header.textRecordCount + "\" ");
322         log("record-size=\"" + header.textRecordSize  + "\" />\n");
323     */
324     }
325 
326 
327     /**
328      *  Inner class to store DOC header information.
329      */
330     private class HeaderInfo {
331 
332         /** length of text section */
333         int textLen = 0;
334 
335         /** number of text records */
336         int textRecordCount = 0;
337 
338         /**
339          *  size of a text record.  This is normally the same as
340          *  TEXT_RECORD_SIZE, but some applications may modify this.
341          */
342         int textRecordSize = 0;
343 
344         /** compression type */
345         int version = 0;
346     }
347 }
348 
349