1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 package org.openoffice.xmerge.converter.xml.sxw.wordsmith; 25 26 import org.w3c.dom.*; 27 28 import java.io.IOException; 29 import java.util.Enumeration; 30 31 import org.openoffice.xmerge.Document; 32 import org.openoffice.xmerge.ConvertData; 33 import org.openoffice.xmerge.ConvertException; 34 import org.openoffice.xmerge.DocumentDeserializer; 35 import org.openoffice.xmerge.converter.xml.OfficeConstants; 36 import org.openoffice.xmerge.converter.palm.PalmDB; 37 import org.openoffice.xmerge.converter.palm.Record; 38 import org.openoffice.xmerge.converter.palm.PdbDecoder; 39 import org.openoffice.xmerge.converter.palm.PalmDocument; 40 import org.openoffice.xmerge.converter.xml.sxw.SxwDocument; 41 42 import java.util.Vector; 43 import java.io.ByteArrayInputStream; 44 45 import org.openoffice.xmerge.converter.xml.*; 46 import org.openoffice.xmerge.util.Debug; 47 import org.openoffice.xmerge.util.XmlUtil; 48 49 /** 50 * <p>WordSmith implementation of 51 * org.openoffice.xmerge.DocumentDeserializer 52 * for the {@link 53 * org.openoffice.xmerge.converter.xml.sxw.wordsmith.PluginFactoryImpl 54 * PluginFactoryImpl}.</p> 55 * 56 * The <code>deserialize</code> method uses a 57 * <code>DocDecoder</code> to read the WordSmith format into a 58 * <code>String</code> object, then it calls <code>buildDocument</code> 59 * to create a <code>SxwDocument</code> object from it. 60 * 61 * @author Herbie Ong, David Proulx 62 */ 63 public final class DocumentDeserializerImpl 64 implements DOCConstants, OfficeConstants, DocumentDeserializer { 65 66 /** A Decoder object for decoding WordSmith format. */ 67 private WSDecoder decoder = null; 68 69 WseFontTable fontTable = null; 70 WseColorTable colorTable = null; 71 StyleCatalog styleCat = null; 72 StyleCatalog oldStyleCat = null; 73 74 /** A <code>ConvertData</code> object assigned to this object. */ 75 private ConvertData cd = null; 76 77 78 /** 79 * Constructor that assigns the given <code>ConvertData</code> 80 * to the object. 81 * 82 * @param cd A <code>ConvertData</code> object to read data for 83 * the conversion process by the deserialize method. 84 */ DocumentDeserializerImpl(ConvertData cd)85 public DocumentDeserializerImpl(ConvertData cd) { 86 this.cd = cd; 87 } 88 89 90 /** 91 * Convert the given <code>ConvertData</code> into a 92 * <code>SxwDocument</code> object. 93 * 94 * @return Resulting <code>Document</code> object. 95 * 96 * @throws ConvertException If any conversion error occurs. 97 * @throws IOException If any I/O error occurs. 98 */ deserialize()99 public Document deserialize() throws ConvertException, 100 IOException { 101 return deserialize(null, cd); 102 } 103 104 deserialize(Document origDoc, ConvertData cd)105 public Document deserialize(Document origDoc, ConvertData cd) 106 throws IOException { 107 108 Document doc = null; 109 PalmDocument palmDoc = null; 110 Enumeration e = cd.getDocumentEnumeration(); 111 112 while(e.hasMoreElements()) { 113 palmDoc = (PalmDocument) e.nextElement(); 114 PalmDB pdb = palmDoc.getPdb(); 115 Record[] recs = pdb.getRecords(); 116 decoder = new WSDecoder(); 117 Wse[] b = decoder.parseDocument(recs); 118 String docName = palmDoc.getName(); 119 doc = buildDocument(docName, b, origDoc); 120 } 121 return doc; 122 } 123 124 125 /** 126 * Temporary method to read existing <code>StyleCatalog</code> 127 * as a starting point. 128 * 129 * @param parentDoc The parent <code>Document</code>. 130 */ readStyleCatalog(Document parentDoc)131 private void readStyleCatalog(Document parentDoc) { 132 Element rootNode = null; 133 try { 134 java.io.ByteArrayOutputStream bos = new java.io.ByteArrayOutputStream(); 135 parentDoc.write(bos); 136 SxwDocument sxwDoc = new SxwDocument("old"); 137 sxwDoc.read(new ByteArrayInputStream(bos.toByteArray())); 138 org.w3c.dom.Document domDoc = sxwDoc.getContentDOM(); 139 140 String families[] = new String[3]; 141 families[0] = "text"; 142 families[1] = "paragraph"; 143 families[2] = "paragraph"; 144 Class classes[] = new Class[3]; 145 classes[0] = TextStyle.class; 146 classes[1] = ParaStyle.class; 147 classes[2] = TextStyle.class; 148 149 NodeList nl = domDoc.getElementsByTagName(TAG_OFFICE_STYLES); 150 oldStyleCat.add(nl.item(0), families, classes, null, false); 151 nl = domDoc.getElementsByTagName(TAG_OFFICE_AUTOMATIC_STYLES); 152 oldStyleCat.add(nl.item(0), families, classes, null, false); 153 nl = domDoc.getElementsByTagName(TAG_OFFICE_MASTER_STYLES); 154 oldStyleCat.add(nl.item(0), families, classes, null, false); 155 156 } catch (Exception e) { 157 Debug.log(Debug.ERROR, "", e); 158 } 159 160 } 161 162 163 /** 164 * Given an array of paragraph <code>Style</code> objects, see if 165 * there is exactly one which matches the text formatting 166 * <code>Style</code> of <code>tStyle</code>. 167 * 168 * @param paraStyles An array of paragraph <code>Style</code> 169 * objects. 170 * @param tStyle Text <code>Style</code> to match. 171 * 172 * @return The paragraph <code>Style</code> that matches. 173 */ matchParaByText(Style paraStyles[], TextStyle tStyle)174 private ParaStyle matchParaByText(Style paraStyles[], TextStyle tStyle) { 175 int matchIndex = -1; 176 int matchCount = 0; 177 Style txtMatches[] = (Style[]) oldStyleCat.getMatching(tStyle); 178 if (txtMatches.length >= 1) { 179 for (int j = 0; j < txtMatches.length; j++) { 180 TextStyle t = (TextStyle)txtMatches[j]; 181 182 if (!t.getFamily().equals("paragraph")) 183 continue; 184 185 for (int k = 0; k < paraStyles.length; k++) { 186 if (t.getName().equals(paraStyles[k].getName())) { 187 matchCount++; 188 matchIndex = k; 189 } 190 } 191 } 192 } 193 if (matchCount == 1) 194 return (ParaStyle)paraStyles[matchIndex]; 195 else return null; 196 } 197 198 199 /** 200 * Take a <code>String</code> of text and turn it into a sequence 201 * of <code>Node</code> objects. 202 * 203 * @param text <code>String</code> of text. 204 * @param parentDoc Parent <code>Document</code>. 205 * 206 * @return Array of <code>Node</code> objects. 207 */ parseText(String text, org.w3c.dom.Document parentDoc)208 private Node[] parseText(String text, org.w3c.dom.Document parentDoc) { 209 Vector nodeVec = new Vector(); 210 211 // Break up the text from the WordSmith text run into Open 212 // Office text runs. There may be more runs in OO because 213 // runs of 2 or more spaces map to nodes. 214 while ((text.indexOf(" ") != -1) || (text.indexOf("\t") != 1)) { 215 216 // Find the indices of tabs and multiple spaces, and 217 // figure out which of them occurs first in the string. 218 int spaceIndex = text.indexOf(" "); 219 int tabIndex = text.indexOf("\t"); 220 if ((spaceIndex == -1) && (tabIndex == -1)) 221 break; // DJP This should not be necessary. What is wrong 222 // with the while() stmt up above? 223 int closerIndex; // Index of the first of these 224 if (spaceIndex == -1) 225 closerIndex = tabIndex; 226 else if (tabIndex == -1) 227 closerIndex = spaceIndex; 228 else 229 closerIndex = (spaceIndex > tabIndex) ? tabIndex : spaceIndex; 230 231 // If there is any text prior to the first occurrence of a 232 // tab or spaces, create a text node from it, then chop it 233 // off the string we're working with. 234 if (closerIndex > 0) { 235 String beginningText = text.substring(0, closerIndex); 236 Text textNode = parentDoc.createTextNode(beginningText); 237 nodeVec.addElement(textNode); 238 log("<TEXT>"); 239 log(beginningText); 240 log("</TEXT>"); 241 } 242 text = text.substring(closerIndex); 243 244 // Handle either tab character or space sequence by creating 245 // an element for it, and then chopping out the text that 246 // represented it in "text". 247 if (closerIndex == tabIndex) { 248 Element tabNode = parentDoc.createElement(TAG_TAB_STOP); 249 nodeVec.add(tabNode); 250 text = text.substring(1); // tab is always a single character 251 log("<TAB/>"); 252 } else { 253 // Compute length of space sequence. 254 int nrSpaces = 2; 255 while ((nrSpaces < text.length()) 256 && text.substring(nrSpaces, nrSpaces + 1).equals(" ")) 257 nrSpaces++; 258 259 Element spaceNode = parentDoc.createElement(TAG_SPACE); 260 spaceNode.setAttribute(ATTRIBUTE_SPACE_COUNT, new Integer(nrSpaces).toString()); 261 nodeVec.add(spaceNode); 262 text = text.substring(nrSpaces); 263 log("<SPACE count=\"" + nrSpaces + "\" />"); 264 } 265 } 266 267 // No more tabs or space sequences. If there's any remaining 268 // text create a text node for it. 269 if (text.length() > 0) { 270 Text textNode = parentDoc.createTextNode(text); 271 nodeVec.add(textNode); 272 log("<TEXT>"); 273 log(text); 274 log("</TEXT>"); 275 } 276 277 // Now create and populate an array to return the nodes in. 278 Node nodes[] = new Node[nodeVec.size()]; 279 for (int i = 0; i < nodeVec.size(); i++) 280 nodes[i] = (Node)nodeVec.elementAt(i); 281 return nodes; 282 } 283 284 285 /** 286 * Parses the text content of a WordSmith format and builds a 287 * <code>SXWDocument</code>. 288 * 289 * @param docName <code>Document</code> name 290 * @param data Text content of WordSmith format 291 * 292 * @return Resulting <code>SXWDocument</code> object. 293 * 294 * @throws IOException If any I/O error occurs. 295 */ buildDocument(String docName, Wse[] data, Document origDoc)296 private SxwDocument buildDocument(String docName, Wse[] data, Document origDoc) 297 throws IOException { 298 299 // create minimum office xml document. 300 SxwDocument sxwDoc = new SxwDocument(docName); 301 sxwDoc.initContentDOM(); 302 303 org.w3c.dom.Document doc = sxwDoc.getContentDOM(); 304 305 // Grab hold of the office:body tag, 306 // Assume there should be one. 307 // This is where top level paragraphs will append to. 308 NodeList list = doc.getElementsByTagName(TAG_OFFICE_BODY); 309 Node bodyNode = list.item(0); 310 311 styleCat = new StyleCatalog(50); 312 oldStyleCat = new StyleCatalog(50); 313 if (origDoc != null) 314 readStyleCatalog(origDoc); 315 316 Element currPara = null; 317 ParaStyle currParaStyle = null; 318 int newTextStyleNr = 0; 319 int newParaStyleNr = 0; 320 321 // Now write out the document body by running through 322 // the list of WordSmith elements and processing each one 323 // in turn. 324 for (int i = 0; i < data.length; i++) { 325 326 if (data[i].getClass() == WsePara.class) { 327 328 currPara = doc.createElement(TAG_PARAGRAPH); 329 log("</PARA>"); 330 log("<PARA>"); 331 332 WsePara p = (WsePara)data[i]; 333 334 // Save info about the first text run, if there is one. 335 WseTextRun firstTextRun = null; 336 337 if ((data.length >= i + 2) 338 && (data[i+1].getClass() == WseTextRun.class)) 339 firstTextRun = (WseTextRun)data[i+1]; 340 341 Style matches[] = oldStyleCat.getMatching(p.makeStyle()); 342 343 // See if we can find a unique match in the catalog 344 // of existing styles from the original document. 345 ParaStyle pStyle = null; 346 if (matches.length == 1) { 347 pStyle = (ParaStyle)matches[0]; 348 log("using an existing style"); 349 } else if ((matches.length > 1) && (firstTextRun != null)) { 350 pStyle = matchParaByText(matches, firstTextRun.makeStyle()); 351 log("resolved a para by looking @ text"); 352 } 353 354 // If nothing found so far, try looking in the catalog 355 // of newly-created styles. 356 // DJP FIXME: if we need to add two para styles with the 357 // same para formatting info but different default text 358 // styles, this won't work! 359 if (pStyle == null) { 360 log("had " + matches.length + " matches in old catalog"); 361 matches = styleCat.getMatching(p.makeStyle()); 362 if (matches.length == 0) { 363 pStyle = p.makeStyle(); 364 String newName = new String("PPP" + ++newParaStyleNr); 365 pStyle.setName(newName); 366 styleCat.add(pStyle); 367 // DJP: write in the text format info here 368 log("created a new style"); 369 } else if (matches.length == 1) { 370 pStyle = (ParaStyle)matches[0]; 371 log("re-using a new style"); 372 } else if (firstTextRun != null) { 373 pStyle = matchParaByText(matches, firstTextRun.makeStyle()); 374 if (pStyle != null) { 375 log("resolved a (new) para by looking @ text"); 376 } else 377 log("Hey this shouldn't happen! - nr of matches is " 378 + matches.length); 379 } 380 } 381 382 if (pStyle == null) 383 log("Unable to figure out a para style"); 384 385 // Figured out a style to use. Specify the style in this 386 // paragraph's attributes. 387 currPara.setAttribute(ATTRIBUTE_TEXT_STYLE_NAME, pStyle.getName()); 388 389 bodyNode.appendChild(currPara); 390 currParaStyle = pStyle; 391 } else if (data[i].getClass() == WseTextRun.class) { 392 WseTextRun tr = (WseTextRun)data[i]; 393 TextStyle trStyle = null; 394 Node trNodes[] = parseText(tr.getText(), doc); 395 396 // First see if the formatting of this text run matches 397 // the default text formatting for this paragraph. If 398 // it does, then just make the text node(s) children of 399 // the current paragraph. 400 Style[] cps = new Style[1]; 401 cps[0] = currParaStyle; 402 if (matchParaByText(cps, tr.makeStyle()) != null) { 403 for (int ii = 0; ii < trNodes.length; ii++) { 404 currPara.appendChild(trNodes[ii]); 405 } 406 continue; 407 } 408 409 // Check for existing, matching styles in the old style 410 // catalog. If exactly one is found, use it. Otherwise, 411 // check the new style catalog, and either use the style 412 // found or add this new one to it. 413 Style matches[] = oldStyleCat.getMatching(tr.makeStyle()); 414 if (matches.length == 1) 415 trStyle = (TextStyle)matches[0]; 416 else { 417 matches = styleCat.getMatching(tr.makeStyle()); 418 if (matches.length == 0) { 419 trStyle = tr.makeStyle(); 420 String newName = new String("TTT" + ++newTextStyleNr); 421 trStyle.setName(newName); 422 styleCat.add(trStyle); 423 } else if (matches.length == 1) 424 trStyle = (TextStyle)matches[0]; 425 else 426 log("multiple text style matches from new catalog"); 427 } 428 429 // Create a text span node, set the style attribute, make the 430 // text node(s) its children, and append it to current paragraph's 431 // list of children. 432 Element textSpanNode = doc.createElement(TAG_SPAN); 433 textSpanNode.setAttribute(ATTRIBUTE_TEXT_STYLE_NAME, trStyle.getName()); 434 for (int ii = 0; ii < trNodes.length; ii++) { 435 textSpanNode.appendChild(trNodes[ii]); 436 } 437 currPara.appendChild(textSpanNode); 438 log("</SPAN>"); 439 } 440 441 else if (data[i].getClass() == WseFontTable.class) { 442 fontTable = (WseFontTable)data[i]; 443 } 444 445 else if (data[i].getClass() == WseColorTable.class) { 446 colorTable = (WseColorTable)data[i]; 447 } 448 } 449 450 451 //NodeList r = doc.getElementsByTagName(TAG_OFFICE_DOCUMENT); 452 NodeList r = doc.getElementsByTagName(TAG_OFFICE_DOCUMENT_CONTENT); 453 Node rootNode = r.item(0); 454 455 // read the original document 456 org.w3c.dom.NodeList nl; 457 if (origDoc != null) { 458 java.io.ByteArrayOutputStream bos = new java.io.ByteArrayOutputStream(); 459 origDoc.write(bos); 460 SxwDocument origSxwDoc = new SxwDocument("old"); 461 origSxwDoc.read(new ByteArrayInputStream(bos.toByteArray())); 462 org.w3c.dom.Document origDomDoc = origSxwDoc.getContentDOM(); 463 464 XmlUtil xu = new XmlUtil(); 465 org.w3c.dom.DocumentFragment df; 466 org.w3c.dom.Node newNode; 467 468 // copy font declarations from original document to the new document 469 nl = origDomDoc.getElementsByTagName(TAG_OFFICE_FONT_DECLS); 470 df = doc.createDocumentFragment(); 471 newNode = xu.deepClone(df, nl.item(0)); 472 rootNode.insertBefore(newNode, bodyNode); 473 474 // copy style catalog from original document to the new document 475 nl = origDomDoc.getElementsByTagName(TAG_OFFICE_STYLES); 476 df = doc.createDocumentFragment(); 477 newNode = xu.deepClone(df, nl.item(0)); 478 rootNode.insertBefore(newNode, bodyNode); 479 480 nl = origDomDoc.getElementsByTagName(TAG_OFFICE_AUTOMATIC_STYLES); 481 df = doc.createDocumentFragment(); 482 newNode = xu.deepClone(df, nl.item(0)); 483 rootNode.insertBefore(newNode, bodyNode); 484 485 nl = origDomDoc.getElementsByTagName(TAG_OFFICE_MASTER_STYLES); 486 df = doc.createDocumentFragment(); 487 newNode = xu.deepClone(df, nl.item(0)); 488 rootNode.insertBefore(newNode, bodyNode); 489 } 490 491 // Original document not specified. We need to add font declarations. 492 // DJP: this might just be for debugging. Merger will probably put 493 // the "real" ones in. 494 // DJP: if really doing it this way, do it right: gather font names 495 // from style catalog(s). 496 else { 497 org.w3c.dom.Node declNode; 498 499 log("<FONT-DECLS/>"); 500 501 declNode = doc.createElement(TAG_OFFICE_FONT_DECLS); 502 rootNode.insertBefore(declNode, bodyNode); 503 org.w3c.dom.Element fontNode; 504 505 fontNode = doc.createElement(TAG_STYLE_FONT_DECL); 506 fontNode.setAttribute(ATTRIBUTE_STYLE_NAME, "Arial"); 507 fontNode.setAttribute(ATTRIBUTE_FO_FONT_FAMILY, "Arial"); 508 fontNode.setAttribute(ATTRIBUTE_STYLE_FONT_PITCH, "variable"); 509 declNode.appendChild(fontNode); 510 511 fontNode = doc.createElement(TAG_STYLE_FONT_DECL); 512 fontNode.setAttribute(ATTRIBUTE_STYLE_NAME, "Arioso"); 513 fontNode.setAttribute(ATTRIBUTE_FO_FONT_FAMILY, "Arioso"); 514 fontNode.setAttribute(ATTRIBUTE_STYLE_FONT_PITCH, "variable"); 515 declNode.appendChild(fontNode); 516 } 517 518 519 // Now add any new styles we have created in this document. 520 nl = doc.getElementsByTagName(TAG_OFFICE_AUTOMATIC_STYLES); 521 Node autoStylesNode = nl.item(0); 522 if (autoStylesNode == null) { 523 autoStylesNode = doc.createElement(TAG_OFFICE_AUTOMATIC_STYLES); 524 log("<OFFICE-AUTOMATIC-STYLES/>"); 525 rootNode.insertBefore(autoStylesNode, bodyNode); 526 } 527 528 Node newStyleCatNode = styleCat.writeNode(doc, "dummy"); 529 nl = newStyleCatNode.getChildNodes(); 530 int nNodes = nl.getLength(); 531 for (int i = 0; i < nNodes; i++) { 532 autoStylesNode.appendChild(nl.item(0)); 533 } 534 535 oldStyleCat.dumpCSV(true); 536 styleCat.dumpCSV(true); 537 return sxwDoc; 538 } 539 540 541 /** 542 * Sends message to the log object. 543 * 544 * @param str Debug message. 545 */ log(String str)546 private void log(String str) { 547 548 Debug.log(Debug.TRACE, str); 549 } 550 551 552 /* 553 public static void main(String args[]) { 554 555 // DocumentDeserializerImpl d = new DocumentDeserializerImpl(new InputStream()); 556 557 Node nodes[] = parseText("Tab here:\tThen some more text"); 558 } 559 */ 560 } 561 562