1*9f62ea84SAndrew Rist /**************************************************************
2*9f62ea84SAndrew Rist *
3*9f62ea84SAndrew Rist * Licensed to the Apache Software Foundation (ASF) under one
4*9f62ea84SAndrew Rist * or more contributor license agreements. See the NOTICE file
5*9f62ea84SAndrew Rist * distributed with this work for additional information
6*9f62ea84SAndrew Rist * regarding copyright ownership. The ASF licenses this file
7*9f62ea84SAndrew Rist * to you under the Apache License, Version 2.0 (the
8*9f62ea84SAndrew Rist * "License"); you may not use this file except in compliance
9*9f62ea84SAndrew Rist * with the License. You may obtain a copy of the License at
10*9f62ea84SAndrew Rist *
11*9f62ea84SAndrew Rist * http://www.apache.org/licenses/LICENSE-2.0
12*9f62ea84SAndrew Rist *
13*9f62ea84SAndrew Rist * Unless required by applicable law or agreed to in writing,
14*9f62ea84SAndrew Rist * software distributed under the License is distributed on an
15*9f62ea84SAndrew Rist * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16*9f62ea84SAndrew Rist * KIND, either express or implied. See the License for the
17*9f62ea84SAndrew Rist * specific language governing permissions and limitations
18*9f62ea84SAndrew Rist * under the License.
19*9f62ea84SAndrew Rist *
20*9f62ea84SAndrew Rist *************************************************************/
21*9f62ea84SAndrew Rist
22cdf0e10cSrcweir #include "HtmlFmtFlt.hxx"
23cdf0e10cSrcweir
24cdf0e10cSrcweir #include <rtl/string.h>
25cdf0e10cSrcweir
26cdf0e10cSrcweir #include <string>
27cdf0e10cSrcweir #include <sstream>
28cdf0e10cSrcweir #include <vector>
29cdf0e10cSrcweir #include <iomanip>
30cdf0e10cSrcweir
31cdf0e10cSrcweir #include <boost/assert.hpp>
32cdf0e10cSrcweir
33cdf0e10cSrcweir using namespace com::sun::star::uno;
34cdf0e10cSrcweir
35cdf0e10cSrcweir //------------------------------------------------------------------------------
36cdf0e10cSrcweir // converts the openoffice text/html clipboard format to the HTML Format
37cdf0e10cSrcweir // well known under MS Windows
38cdf0e10cSrcweir // the MS HTML Format has a header before the real html data
39cdf0e10cSrcweir //
// Version:1.0 Version number of the clipboard format. The starting version is 0.9
41cdf0e10cSrcweir // StartHTML: Byte count from the beginning of the clipboard to the start
42cdf0e10cSrcweir // of the context, or -1 if no context
43cdf0e10cSrcweir // EndHTML: Byte count from the beginning of the clipboard to the end
44cdf0e10cSrcweir // of the context, or -1 if no context
45cdf0e10cSrcweir // StartFragment: Byte count from the beginning of the clipboard to the
46cdf0e10cSrcweir // start of the fragment
47cdf0e10cSrcweir // EndFragment: Byte count from the beginning of the clipboard to the
48cdf0e10cSrcweir // end of the fragment
49cdf0e10cSrcweir // StartSelection: Byte count from the beginning of the clipboard to the
50cdf0e10cSrcweir // start of the selection
51cdf0e10cSrcweir // EndSelection: Byte count from the beginning of the clipboard to the
52cdf0e10cSrcweir // end of the selection
53cdf0e10cSrcweir //
54cdf0e10cSrcweir // StartSelection and EndSelection are optional
// The fragment should be preceded and followed by the HTML comments
// <!--StartFragment--> and <!--EndFragment--> (no space between "!--" and
// the text).
58cdf0e10cSrcweir //------------------------------------------------------------------------------
59cdf0e10cSrcweir
namespace // private
{
    // Writes one "<label><offset>\r\n" header line. The offset is printed in
    // decimal, padded on the left to at least ten characters using the
    // stream's current fill character.
    void appendOffsetLine(std::ostringstream& rOut, const char* pLabel, size_t nOffset)
    {
        // the width only applies to the very next formatted output, so it
        // has to be set after the label has been written
        rOut << pLabel;
        rOut << std::setw(10) << nOffset;
        rOut << "\r\n";
    }

    // Builds the textual header that precedes the html data in the MS
    // Windows "HTML Format" clipboard format.
    //
    // startHtml / endHtml         - offsets written to the StartHTML and
    //                               EndHTML lines
    // startFragment / endFragment - offsets written to the StartFragment and
    //                               EndFragment lines
    //
    // Returns the header text: a "Version:1.0" line followed by the four
    // offset lines, each terminated by CR LF. As every offset is padded to
    // ten digits the header length does not depend on the values as long as
    // they fit into ten digits.
    std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment)
    {
        std::ostringstream aHeader;

        // fill character and number base are sticky, so set them only once
        aHeader << std::setfill('0') << std::dec;

        aHeader << "Version:1.0" << "\r\n";
        appendOffsetLine(aHeader, "StartHTML:", startHtml);
        appendOffsetLine(aHeader, "EndHTML:", endHtml);
        appendOffsetLine(aHeader, "StartFragment:", startFragment);
        appendOffsetLine(aHeader, "EndFragment:", endFragment);

        return aHeader.str();
    }

} // namespace private
74cdf0e10cSrcweir
75cdf0e10cSrcweir
// The office always writes the opening and the closing html tag in upper
// case and without any spaces; neither of the two tags takes parameters,
// so they can be searched for literally.
const std::string TAG_HTML("<HTML>");
const std::string TAG_END_HTML("</HTML>");

// The body tag may carry parameters (e.g. <BODY param>), which is why only
// the tag prefixes are defined here and the closing '>' has to be searched
// for separately. #92840#
const std::string TAG_BODY("<BODY");
const std::string TAG_END_BODY("</BODY");
85cdf0e10cSrcweir
TextHtmlToHTMLFormat(Sequence<sal_Int8> & aTextHtml)86cdf0e10cSrcweir Sequence<sal_Int8> SAL_CALL TextHtmlToHTMLFormat(Sequence<sal_Int8>& aTextHtml)
87cdf0e10cSrcweir {
88cdf0e10cSrcweir OSL_ASSERT(aTextHtml.getLength() > 0);
89cdf0e10cSrcweir
90cdf0e10cSrcweir if (!(aTextHtml.getLength() > 0))
91cdf0e10cSrcweir return Sequence<sal_Int8>();
92cdf0e10cSrcweir
93cdf0e10cSrcweir // fill the buffer with dummy values to calc the exact length
94cdf0e10cSrcweir std::string dummyHtmlHeader = GetHtmlFormatHeader(0, 0, 0, 0);
95cdf0e10cSrcweir size_t lHtmlFormatHeader = dummyHtmlHeader.length();
96cdf0e10cSrcweir
97cdf0e10cSrcweir std::string textHtml(
98cdf0e10cSrcweir reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()),
99cdf0e10cSrcweir reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()) + aTextHtml.getLength());
100cdf0e10cSrcweir
101cdf0e10cSrcweir std::string::size_type nStartHtml = textHtml.find(TAG_HTML) + lHtmlFormatHeader - 1; // we start one before '<HTML>' Word 2000 does also so
102cdf0e10cSrcweir std::string::size_type nEndHtml = textHtml.find(TAG_END_HTML) + lHtmlFormatHeader + TAG_END_HTML.length() + 1; // our SOffice 5.2 wants 2 behind </HTML>?
103cdf0e10cSrcweir
104cdf0e10cSrcweir // The body tag may have parameters so we need to search for the
105cdf0e10cSrcweir // closing '>' manually e.g. <BODY param> #92840#
106cdf0e10cSrcweir std::string::size_type nStartFragment = textHtml.find(">", textHtml.find(TAG_BODY)) + lHtmlFormatHeader + 1;
107cdf0e10cSrcweir std::string::size_type nEndFragment = textHtml.find(TAG_END_BODY) + lHtmlFormatHeader;
108cdf0e10cSrcweir
109cdf0e10cSrcweir std::string htmlFormat = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment);
110cdf0e10cSrcweir htmlFormat += textHtml;
111cdf0e10cSrcweir
112cdf0e10cSrcweir Sequence<sal_Int8> byteSequence(htmlFormat.length() + 1); // space the trailing '\0'
113cdf0e10cSrcweir rtl_zeroMemory(byteSequence.getArray(), byteSequence.getLength());
114cdf0e10cSrcweir
115cdf0e10cSrcweir rtl_copyMemory(
116cdf0e10cSrcweir static_cast<void*>(byteSequence.getArray()),
117cdf0e10cSrcweir static_cast<const void*>(htmlFormat.c_str()),
118cdf0e10cSrcweir htmlFormat.length());
119cdf0e10cSrcweir
120cdf0e10cSrcweir return byteSequence;
121cdf0e10cSrcweir }
122cdf0e10cSrcweir
123cdf0e10cSrcweir const char* HtmlStartTag = "<html";
124cdf0e10cSrcweir
HTMLFormatToTextHtml(const Sequence<sal_Int8> & aHTMLFormat)125cdf0e10cSrcweir Sequence<sal_Int8> HTMLFormatToTextHtml(const Sequence<sal_Int8>& aHTMLFormat)
126cdf0e10cSrcweir {
127cdf0e10cSrcweir BOOST_ASSERT(isHTMLFormat(aHTMLFormat) && "No HTML Format provided");
128cdf0e10cSrcweir
129cdf0e10cSrcweir Sequence<sal_Int8>& nonconstHTMLFormatRef = const_cast< Sequence<sal_Int8>& >(aHTMLFormat);
130cdf0e10cSrcweir sal_Char* dataStart = reinterpret_cast<sal_Char*>(nonconstHTMLFormatRef.getArray());
131cdf0e10cSrcweir sal_Char* dataEnd = dataStart + nonconstHTMLFormatRef.getLength() - 1;
132cdf0e10cSrcweir const sal_Char* htmlStartTag = strcasestr(dataStart, HtmlStartTag);
133cdf0e10cSrcweir
134cdf0e10cSrcweir BOOST_ASSERT(htmlStartTag && "Seems to be no HTML at all");
135cdf0e10cSrcweir
136cdf0e10cSrcweir // It doesn't seem to be HTML? Well then simply return what has been
137cdf0e10cSrcweir // provided in non-debug builds
138cdf0e10cSrcweir if (htmlStartTag == NULL)
139cdf0e10cSrcweir {
140cdf0e10cSrcweir return aHTMLFormat;
141cdf0e10cSrcweir }
142cdf0e10cSrcweir
143cdf0e10cSrcweir sal_Int32 len = dataEnd - htmlStartTag;
144cdf0e10cSrcweir Sequence<sal_Int8> plainHtmlData(len);
145cdf0e10cSrcweir
146cdf0e10cSrcweir rtl_copyMemory(static_cast<void*>(plainHtmlData.getArray()), htmlStartTag, len);
147cdf0e10cSrcweir
148cdf0e10cSrcweir return plainHtmlData;
149cdf0e10cSrcweir }
150cdf0e10cSrcweir
151cdf0e10cSrcweir /* A simple format detection. We are just comparing the first few bytes
152cdf0e10cSrcweir of the provided byte sequence to see whether or not it is the MS
153cdf0e10cSrcweir Office Html format. If it shows that this is not reliable enough we
154cdf0e10cSrcweir can improve this
155cdf0e10cSrcweir */
156cdf0e10cSrcweir const char HtmlFormatStart[] = "Version:";
157cdf0e10cSrcweir int HtmlFormatStartLen = (sizeof(HtmlFormatStart) - 1);
158cdf0e10cSrcweir
isHTMLFormat(const Sequence<sal_Int8> & aHtmlSequence)159cdf0e10cSrcweir bool isHTMLFormat(const Sequence<sal_Int8>& aHtmlSequence)
160cdf0e10cSrcweir {
161cdf0e10cSrcweir if (aHtmlSequence.getLength() < HtmlFormatStartLen)
162cdf0e10cSrcweir return false;
163cdf0e10cSrcweir
164cdf0e10cSrcweir return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart,
165cdf0e10cSrcweir HtmlFormatStartLen,
166cdf0e10cSrcweir reinterpret_cast<const sal_Char*>(aHtmlSequence.getConstArray()),
167cdf0e10cSrcweir HtmlFormatStartLen) == 0;
168cdf0e10cSrcweir }
169