sxw/wordsmith/WSDecoder.java

/**************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *************************************************************/


package org.openoffice.xmerge.converter.xml.sxw.wordsmith;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.FileInputStream;
import java.io.UnsupportedEncodingException;
import org.openoffice.xmerge.util.Debug;

import org.openoffice.xmerge.converter.palm.*;
import org.openoffice.xmerge.util.Resources;

/**
 *  This class is used by {@link
 *  org.openoffice.xmerge.converter.xml.sxw.wordsmith.DocumentDeserializerImpl
 *  DocumentDeserializerImpl} to decode a WordSmith format.  It currently
 *  decodes the text content into a single <code>String</code> object.
 *
 *  @author   Herbie Ong, David Proulx
 */
final class WSDecoder implements DOCConstants {

    /** For decoding purposes. */
    private final static int COUNT_BITS = 3;

    /** Resources object for I18N. */
    private Resources res = null;

    /**
     *  Default constructor creates a header and
     *  a text buffer for holding all the text in
     *  the DOC db.
     */
    WSDecoder() {
        res = Resources.getInstance();
    }

    /**
     *  Decode the text records into a single <code>byte</code> array.
     *
     *  @param  recs  <code>Record</code> array holding WordSmith
     *                  contents.
     *
     *  @throws  IOException  If any I/O error occurs.
     */
    byte[] parseRecords(Record[] recs) throws IOException {

        // read the header record
        HeaderInfo header = readHeader(recs[0].getBytes());
        dumpHeader(header);
        byte[][] byteArrays = new byte[recs.length - 1][];
        for (int i = 0; i < recs.length - 1; i++) byteArrays[i] = null;

        switch (header.version & ~4) {  // DJP: "4" indicates OOB data is present.
        // Add a constant to handle this, might also need code to handle it.

            case COMPRESSED:
            case 3:   // DJP: determined this empirically.  Are Herbie's constants wrong?
                for (int i = 1; i < recs.length; i++) {
                    byteArrays[i-1] = decompress(recs[i].getBytes(),
                    header.textRecordSize);
                    Debug.log(Debug.INFO, "processing " + byteArrays[i-1].length + " bytes");
                }

                break;

            case UNCOMPRESSED:
                for (int i = 1; i < recs.length; i++) {
                    byteArrays[i-1] = recs[i].getBytes();
                    Debug.log(Debug.INFO, "processing " + byteArrays[i-1].length + " bytes");
                }

                break;

            default:
                throw new IOException(res.getString("UNKNOWN_DOC_VERSION"));

        }

        // Concatenate byteArrays[][] into a single byte array.
        int length = 0;
        for (int i = 0; i < recs.length - 1; i++)
            length += byteArrays[i].length;
        byte bigArray[] = new byte[length];
        int offset = 0;
        for (int i = 0; i < recs.length - 1; i++) {
            System.arraycopy(byteArrays[i], 0, bigArray, offset,
            byteArrays[i].length);
            offset += byteArrays[i].length;
        }
        return bigArray;
    }


    /**
     *  Decode the text records into a <code>Wse</code> array.
     *
     *  @param  recs  <code>Record</code> array holding DOC
     *                    contents.
     *
     *  @throws  IOException  If any I/O error occurs.
     */
    Wse[] parseDocument(Record[] recs) throws IOException {

        java.util.Vector v = new java.util.Vector(20, 20);
        WseFontTable fontTable = null;
        WseColorTable colorTable = null;

        // rawData is the document data to be parsed.
        byte rawData[] = parseRecords(recs);

        // beginning of document has some header information, including
        // optional font and color tables.
        // DJP: maybe should add a new WSelement (docHeader) to hold
        // header info.
        // DJP: finish code here to parse header
        if (rawData[0] != 2) throw new IOException();
        int nParagraphs = util.intFrom4bytes(rawData, 2);
        int nAtoms      = util.intFrom4bytes(rawData, 6);
        int nChars      = util.intFrom4bytes(rawData, 10);
        int miscSize    = util.intFrom4bytes(rawData, 14);
        int curIndex = 18;

        while (curIndex < rawData.length) {
            if (WsePara.isValid(rawData, curIndex)) {
                v.add(new WsePara(rawData, curIndex));
                curIndex = WsePara.computeNewIndex(rawData, curIndex);
            } else if (WseTextRun.isValid(rawData, curIndex)) {
                v.add(new WseTextRun(rawData, curIndex, fontTable, colorTable));
                curIndex = WseTextRun.computeNewIndex(rawData, curIndex);
            } else if (WseFontTable.isValid(rawData, curIndex)) {
                fontTable = new WseFontTable(rawData, curIndex);
                v.add(fontTable);
                curIndex = WseFontTable.computeNewIndex(rawData, curIndex);
            } else if (WseColorTable.isValid(rawData, curIndex)) {
                colorTable = new WseColorTable(rawData, curIndex);
                v.add(colorTable);
                curIndex = WseColorTable.computeNewIndex(rawData, curIndex);
            } else {
                Debug.log(Debug.ERROR, "Unknown code " + rawData[curIndex]);
                throw new IOException();
            }
        }

        return (Wse[])v.toArray(new Wse[2]);
    }


    /**
     *  <p>Decompress the <code>byte</code> array.</p>
     *
     *  <p>The resulting uncompressed <code>byte</code> array
     *  should be within <code>textRecordSize</code> length,
     *  definitely within twice the size it claims, else treat
     *  it as a problem with the encoding of that PDB and
     *  throw <code>IOException</code>.</p>
     *
     *  @param  cBytes           Compressed <code>byte</code> array
     *  @param  textRecordSize  Size of uncompressed <code>byte</code>
     *                          array
     *
     *  @throws   IOException  If <code>textRecordSize</codeL &lt;
     *                         <code>cBytes.length</code>.
     */
    private byte[] decompress(byte[] cBytes, int textRecordSize)
    throws IOException {

        // create byte array for storing uncompressed bytes
        // it should be within textRecordSize range, definitely
        // within twice of textRecordSize!  if not, then
        // an ArrayIndexOutOfBoundsException will get thrown,
        // and it should be converted into an IOException, and
        // treat it as a conversion error.
        byte[] uBytes = new byte[textRecordSize*2];

        int up = 0;
        int cp = 0;

        try {

            while (cp < cBytes.length) {

                int c = cBytes[cp++] & 0xff;

                // codes 1...8 mean copy that many bytes
                if (c > 0 && c < 9) {

                    while (c-- > 0)
                        uBytes[up++] = cBytes[cp++];
                }

                // codes 0, 9...0x7F represent themselves
                else if (c < 0x80) {
                    uBytes[up++] = (byte) c;
                }

                // codes 0xC0...0xFF represent "space + ascii char"
                else if (c >= 0xC0) {
                    uBytes[up++] = (byte) ' ';
                    uBytes[up++] = (byte) (c ^ 0x80);
                }

                // codes 0x80...0xBf represent sequences
                else {
                    c <<= 8;
                    c += cBytes[cp++] & 0xff;
                    int m = (c & 0x3fff) >> COUNT_BITS;
                    int n = c & ((1 << COUNT_BITS) - 1);
                    n += COUNT_BITS;
                    while (n-- > 0) {
                        uBytes[up] = uBytes[up - m];
                        up++;
                    }
                }
            }

        } catch (ArrayIndexOutOfBoundsException e) {

            throw new IOException(
            res.getString("DOC_TEXT_RECORD_SIZE_EXCEEDED"));
        }

        // note that ubytes may be larger that the amount of
        // uncompressed bytes, so trim it to another byte array
        // with the exact size.
        byte[] textBytes = new byte[up];
        System.arraycopy(uBytes, 0, textBytes, 0, up);

        return textBytes;
    }


    /**
     *  Read the header <code>byte</code> array.
     *
     *  @param  bytes  <code>byte</code> array containing header
     *                 record data.
     *
     *  @return  <code>HeaderInfo</code> object.
     *
     *  @throws  IOException  If any I/O error occurs.
     */
    private HeaderInfo readHeader(byte[] bytes) throws IOException {

        HeaderInfo header = new HeaderInfo();

        ByteArrayInputStream bis = new ByteArrayInputStream(bytes);
        DataInputStream dis = new DataInputStream(bis);

        // Normally the first 2 bytes comprised of the version
        // which should either be COMPRESSED or UNCOMPRESSED
        // SmartDoc/Quickword would add a 0x01 to the first
        // byte, thus their version would be 0x0101 for UNCOMPRESSED
        // instead of 0x0001 and 0x0102 for UNCOMPRESSED instead of
        // 0x0002.

        dis.readByte();
        header.version = dis.readByte();

        // read extra 2 unused bytes
        dis.readShort();

        // Read the text length, this should be unsigned 4 bytes.
        // We could store the read value into a long, but then
        // our current buffer limit is the max positive of an int.
        // That is a large enough limit, thus we shall stay with
        // storing the value in an int.  If it exceeds, then
        // an IOException should be thrown.
        header.textLen = dis.readInt();
        if (header.textLen < 0) {
            throw new IOException(res.getString("DOC_TEXT_LENGTH_EXCEEDED"));
        }

        // read the number of records - unsigned 2 bytes
        header.textRecordCount = ((int) dis.readShort()) & 0x0000ffff;

        // read the record size - unsigned 2 bytes
        header.textRecordSize = ((int) dis.readShort()) & 0x0000ffff;

        // read extra 4 unused bytes
        dis.readInt();

        return header;
    }


    /**
     *  Prints out header info into log.
     *  Used for debugging purposes only.
     *
     *  @param  header  <code>HeaderInfo</code> structure.
     */
    private void dumpHeader(HeaderInfo header) {
    /*
        log("<DOC_INFO ");
        log("version=\"" + header.version + "\" ");
        log("text-length=\"" + header.textLen + "\" ");
        log("number-of-records=\"" + header.textRecordCount + "\" ");
        log("record-size=\"" + header.textRecordSize  + "\" />\n");
    */
    }


    /**
     *  Inner class to store DOC header information.
     */
    private class HeaderInfo {

        /** length of text section */
        int textLen = 0;

        /** number of text records */
        int textRecordCount = 0;

        /**
         *  size of a text record.  This is normally the same as
         *  TEXT_RECORD_SIZE, but some applications may modify this.
         */
        int textRecordSize = 0;

        /** compression type */
        int version = 0;
    }
}