1*9f62ea84SAndrew Rist /**************************************************************
2*9f62ea84SAndrew Rist  *
3*9f62ea84SAndrew Rist  * Licensed to the Apache Software Foundation (ASF) under one
4*9f62ea84SAndrew Rist  * or more contributor license agreements.  See the NOTICE file
5*9f62ea84SAndrew Rist  * distributed with this work for additional information
6*9f62ea84SAndrew Rist  * regarding copyright ownership.  The ASF licenses this file
7*9f62ea84SAndrew Rist  * to you under the Apache License, Version 2.0 (the
8*9f62ea84SAndrew Rist  * "License"); you may not use this file except in compliance
9*9f62ea84SAndrew Rist  * with the License.  You may obtain a copy of the License at
10*9f62ea84SAndrew Rist  *
11*9f62ea84SAndrew Rist  *   http://www.apache.org/licenses/LICENSE-2.0
12*9f62ea84SAndrew Rist  *
13*9f62ea84SAndrew Rist  * Unless required by applicable law or agreed to in writing,
14*9f62ea84SAndrew Rist  * software distributed under the License is distributed on an
15*9f62ea84SAndrew Rist  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16*9f62ea84SAndrew Rist  * KIND, either express or implied.  See the License for the
17*9f62ea84SAndrew Rist  * specific language governing permissions and limitations
18*9f62ea84SAndrew Rist  * under the License.
19*9f62ea84SAndrew Rist  *
20*9f62ea84SAndrew Rist  *************************************************************/
21*9f62ea84SAndrew Rist 
22cdf0e10cSrcweir #include "HtmlFmtFlt.hxx"
23cdf0e10cSrcweir 
24cdf0e10cSrcweir #include <rtl/string.h>
25cdf0e10cSrcweir 
26cdf0e10cSrcweir #include <string>
27cdf0e10cSrcweir #include <sstream>
28cdf0e10cSrcweir #include <vector>
29cdf0e10cSrcweir #include <iomanip>
30cdf0e10cSrcweir 
31cdf0e10cSrcweir #include <boost/assert.hpp>
32cdf0e10cSrcweir 
33cdf0e10cSrcweir using namespace com::sun::star::uno;
34cdf0e10cSrcweir 
35cdf0e10cSrcweir //------------------------------------------------------------------------------
36cdf0e10cSrcweir // converts the openoffice text/html clipboard format to the HTML Format
37cdf0e10cSrcweir // well known under MS Windows
38cdf0e10cSrcweir // the MS HTML Format has a header before the real html data
39cdf0e10cSrcweir //
// Version:1.0		Version number of the clipboard. Starting version is 0.9
41cdf0e10cSrcweir // StartHTML:		Byte count from the beginning of the clipboard to the start
42cdf0e10cSrcweir //					of the context, or -1 if no context
43cdf0e10cSrcweir // EndHTML:			Byte count from the beginning of the clipboard to the end
44cdf0e10cSrcweir //					of the context, or -1 if no context
45cdf0e10cSrcweir // StartFragment:	Byte count from the beginning of the clipboard to the
46cdf0e10cSrcweir //					start of the fragment
47cdf0e10cSrcweir // EndFragment:		Byte count from the beginning of the clipboard to the
48cdf0e10cSrcweir //					end of the fragment
49cdf0e10cSrcweir // StartSelection:	Byte count from the beginning of the clipboard to the
50cdf0e10cSrcweir //					start of the selection
51cdf0e10cSrcweir // EndSelection:	Byte count from the beginning of the clipboard to the
52cdf0e10cSrcweir //					end of the selection
53cdf0e10cSrcweir //
54cdf0e10cSrcweir // StartSelection and EndSelection are optional
// The fragment should be preceded and followed by the HTML comments
// <!--StartFragment--> and <!--EndFragment--> (no space between !-- and the
// text)
58cdf0e10cSrcweir //------------------------------------------------------------------------------
59cdf0e10cSrcweir 
60cdf0e10cSrcweir namespace // private
61cdf0e10cSrcweir {
// Writes a single header line of the form
// "<label><offset as 10-digit, zero-padded decimal>\r\n" to the given stream.
void AppendOffsetLine(std::ostringstream& out, const char* label, size_t offset)
{
    out << label
        << std::setw(10) << std::setfill('0') << std::dec << offset
        << "\r\n";
}

// Builds the textual header of the MS Windows "HTML Format" clipboard format.
//
// The header consists of the fixed line "Version:1.0" followed by the
// StartHTML, EndHTML, StartFragment and EndFragment lines, in that order.
// Every line is terminated by CR LF and every offset is printed with a
// minimum width of ten zero-padded decimal digits, so for offsets of up to
// ten digits the header length does not depend on the values passed in.
//
// startHtml / endHtml          byte offsets written to StartHTML / EndHTML
// startFragment / endFragment  byte offsets written to StartFragment / EndFragment
//
// Returns the complete header as a string.
std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment)
{
    std::ostringstream header;
    header << "Version:1.0" << "\r\n";
    AppendOffsetLine(header, "StartHTML:", startHtml);
    AppendOffsetLine(header, "EndHTML:", endHtml);
    AppendOffsetLine(header, "StartFragment:", startFragment);
    AppendOffsetLine(header, "EndFragment:", endFragment);
    return header.str();
}
72cdf0e10cSrcweir 
73cdf0e10cSrcweir } // namespace private
74cdf0e10cSrcweir 
75cdf0e10cSrcweir 
// The office always writes the opening and closing html tags in upper case
// and without spaces; neither tag carries parameters, so the complete tag
// text can be searched for directly.
const std::string TAG_HTML("<HTML>");
const std::string TAG_END_HTML("</HTML>");

// The body tag may carry parameters (e.g. <BODY param>, #92840#), so only
// the tag prefix is stored here and the closing '>' has to be searched for
// separately.
const std::string TAG_BODY("<BODY");
const std::string TAG_END_BODY("</BODY");
85cdf0e10cSrcweir 
TextHtmlToHTMLFormat(Sequence<sal_Int8> & aTextHtml)86cdf0e10cSrcweir Sequence<sal_Int8> SAL_CALL TextHtmlToHTMLFormat(Sequence<sal_Int8>& aTextHtml)
87cdf0e10cSrcweir {
88cdf0e10cSrcweir 	OSL_ASSERT(aTextHtml.getLength() > 0);
89cdf0e10cSrcweir 
90cdf0e10cSrcweir 	if (!(aTextHtml.getLength() > 0))
91cdf0e10cSrcweir 		return Sequence<sal_Int8>();
92cdf0e10cSrcweir 
93cdf0e10cSrcweir 	// fill the buffer with dummy values to calc the exact length
94cdf0e10cSrcweir     std::string dummyHtmlHeader = GetHtmlFormatHeader(0, 0, 0, 0);
95cdf0e10cSrcweir 	size_t lHtmlFormatHeader = dummyHtmlHeader.length();
96cdf0e10cSrcweir 
97cdf0e10cSrcweir 	std::string textHtml(
98cdf0e10cSrcweir 	    reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()),
99cdf0e10cSrcweir 		reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()) + aTextHtml.getLength());
100cdf0e10cSrcweir 
101cdf0e10cSrcweir 	std::string::size_type nStartHtml = textHtml.find(TAG_HTML) + lHtmlFormatHeader - 1; // we start one before '<HTML>' Word 2000 does also so
102cdf0e10cSrcweir 	std::string::size_type nEndHtml = textHtml.find(TAG_END_HTML) + lHtmlFormatHeader + TAG_END_HTML.length() + 1; // our SOffice 5.2 wants 2 behind </HTML>?
103cdf0e10cSrcweir 
104cdf0e10cSrcweir 	// The body tag may have parameters so we need to search for the
105cdf0e10cSrcweir 	// closing '>' manually e.g. <BODY param> #92840#
106cdf0e10cSrcweir 	std::string::size_type nStartFragment = textHtml.find(">", textHtml.find(TAG_BODY)) + lHtmlFormatHeader + 1;
107cdf0e10cSrcweir 	std::string::size_type nEndFragment = textHtml.find(TAG_END_BODY) + lHtmlFormatHeader;
108cdf0e10cSrcweir 
109cdf0e10cSrcweir 	std::string htmlFormat = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment);
110cdf0e10cSrcweir 	htmlFormat += textHtml;
111cdf0e10cSrcweir 
112cdf0e10cSrcweir 	Sequence<sal_Int8> byteSequence(htmlFormat.length() + 1); // space the trailing '\0'
113cdf0e10cSrcweir 	rtl_zeroMemory(byteSequence.getArray(), byteSequence.getLength());
114cdf0e10cSrcweir 
115cdf0e10cSrcweir 	rtl_copyMemory(
116cdf0e10cSrcweir 		static_cast<void*>(byteSequence.getArray()),
117cdf0e10cSrcweir 		static_cast<const void*>(htmlFormat.c_str()),
118cdf0e10cSrcweir 		htmlFormat.length());
119cdf0e10cSrcweir 
120cdf0e10cSrcweir 	return byteSequence;
121cdf0e10cSrcweir }
122cdf0e10cSrcweir 
123cdf0e10cSrcweir const char* HtmlStartTag = "<html";
124cdf0e10cSrcweir 
HTMLFormatToTextHtml(const Sequence<sal_Int8> & aHTMLFormat)125cdf0e10cSrcweir Sequence<sal_Int8> HTMLFormatToTextHtml(const Sequence<sal_Int8>& aHTMLFormat)
126cdf0e10cSrcweir {
127cdf0e10cSrcweir   BOOST_ASSERT(isHTMLFormat(aHTMLFormat) && "No HTML Format provided");
128cdf0e10cSrcweir 
129cdf0e10cSrcweir   Sequence<sal_Int8>& nonconstHTMLFormatRef = const_cast< Sequence<sal_Int8>& >(aHTMLFormat);
130cdf0e10cSrcweir   sal_Char* dataStart = reinterpret_cast<sal_Char*>(nonconstHTMLFormatRef.getArray());
131cdf0e10cSrcweir   sal_Char* dataEnd = dataStart + nonconstHTMLFormatRef.getLength() - 1;
132cdf0e10cSrcweir   const sal_Char* htmlStartTag = strcasestr(dataStart, HtmlStartTag);
133cdf0e10cSrcweir 
134cdf0e10cSrcweir   BOOST_ASSERT(htmlStartTag && "Seems to be no HTML at all");
135cdf0e10cSrcweir 
136cdf0e10cSrcweir   // It doesn't seem to be HTML? Well then simply return what has been
137cdf0e10cSrcweir   // provided in non-debug builds
138cdf0e10cSrcweir   if (htmlStartTag == NULL)
139cdf0e10cSrcweir 	{
140cdf0e10cSrcweir 	return aHTMLFormat;
141cdf0e10cSrcweir 	}
142cdf0e10cSrcweir 
143cdf0e10cSrcweir   sal_Int32 len = dataEnd - htmlStartTag;
144cdf0e10cSrcweir   Sequence<sal_Int8> plainHtmlData(len);
145cdf0e10cSrcweir 
146cdf0e10cSrcweir   rtl_copyMemory(static_cast<void*>(plainHtmlData.getArray()), htmlStartTag, len);
147cdf0e10cSrcweir 
148cdf0e10cSrcweir   return plainHtmlData;
149cdf0e10cSrcweir }
150cdf0e10cSrcweir 
/* Data used for a simple format detection: the MS Office HTML Format is
   recognized by looking only at the first few bytes of a byte sequence
   and comparing them with the text below. Should this turn out to be not
   reliable enough, the detection can be refined.

   HtmlFormatStartLen is the number of characters of HtmlFormatStart
   without the terminating '\0'.
*/
const char HtmlFormatStart[] = "Version:";
int HtmlFormatStartLen = static_cast<int>(sizeof(HtmlFormatStart) / sizeof(HtmlFormatStart[0])) - 1;
158cdf0e10cSrcweir 
isHTMLFormat(const Sequence<sal_Int8> & aHtmlSequence)159cdf0e10cSrcweir bool isHTMLFormat(const Sequence<sal_Int8>& aHtmlSequence)
160cdf0e10cSrcweir {
161cdf0e10cSrcweir   if (aHtmlSequence.getLength() < HtmlFormatStartLen)
162cdf0e10cSrcweir 	return false;
163cdf0e10cSrcweir 
164cdf0e10cSrcweir   return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart,
165cdf0e10cSrcweir 												   HtmlFormatStartLen,
166cdf0e10cSrcweir 												   reinterpret_cast<const sal_Char*>(aHtmlSequence.getConstArray()),
167cdf0e10cSrcweir 												   HtmlFormatStartLen) == 0;
168cdf0e10cSrcweir }
169