xref: /trunk/main/vcl/aqua/source/dtrans/HtmlFmtFlt.cxx (revision cdf0e10c)
1 #include "HtmlFmtFlt.hxx"
2 
3 #include <rtl/string.h>
4 
5 #include <string>
6 #include <sstream>
7 #include <vector>
8 #include <iomanip>
9 
10 #include <boost/assert.hpp>
11 
12 using namespace com::sun::star::uno;
13 
14 //------------------------------------------------------------------------------
15 // converts the openoffice text/html clipboard format to the HTML Format
16 // well known under MS Windows
17 // the MS HTML Format has a header before the real html data
18 //
// Version:1.0		Version number of the clipboard. Starting version is 0.9
20 // StartHTML:		Byte count from the beginning of the clipboard to the start
21 //					of the context, or -1 if no context
22 // EndHTML:			Byte count from the beginning of the clipboard to the end
23 //					of the context, or -1 if no context
24 // StartFragment:	Byte count from the beginning of the clipboard to the
25 //					start of the fragment
26 // EndFragment:		Byte count from the beginning of the clipboard to the
27 //					end of the fragment
28 // StartSelection:	Byte count from the beginning of the clipboard to the
29 //					start of the selection
30 // EndSelection:	Byte count from the beginning of the clipboard to the
31 //					end of the selection
32 //
33 // StartSelection and EndSelection are optional
// The fragment should be preceded and followed by the HTML comments
// <!--StartFragment--> and <!--EndFragment--> (no space between "!--" and the
// text)
37 //------------------------------------------------------------------------------
38 
namespace // private
{

// Writes a single header line of the form "<key><offset>\r\n" where the
// offset is printed in decimal, zero-padded to at least ten digits.
void AppendOffsetLine(std::ostringstream& rHeader, const char* pKey, size_t nOffset)
{
    rHeader << pKey;
    rHeader << std::setw(10) << std::setfill('0') << std::dec << nOffset;
    rHeader << "\r\n";
}

// Builds the textual header that precedes the html data in the MS Windows
// "HTML Format". All offsets are byte counts from the beginning of the
// clipboard data (i.e. they include the length of this header).
//
// startHtml / endHtml           - bounds of the html context
// startFragment / endFragment   - bounds of the fragment
//
// Returns the header including the terminating "\r\n" of its last line.
std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment)
{
    std::ostringstream aHeader;

    aHeader << "Version:1.0" << "\r\n";
    AppendOffsetLine(aHeader, "StartHTML:", startHtml);
    AppendOffsetLine(aHeader, "EndHTML:", endHtml);
    AppendOffsetLine(aHeader, "StartFragment:", startFragment);
    AppendOffsetLine(aHeader, "EndFragment:", endFragment);

    return aHeader.str();
}

} // namespace private
53 
54 
// The office always writes the opening and closing html tags in upper case
// and without spaces; neither tag allows parameters, so the complete tags
// can be searched for literally.
const std::string TAG_HTML("<HTML>");
const std::string TAG_END_HTML("</HTML>");

// The body tag may carry parameters (e.g. <BODY param>, see #92840#), so only
// the beginning of the tag is stored here; the closing '>' has to be located
// separately.
const std::string TAG_BODY("<BODY");
const std::string TAG_END_BODY("</BODY");
64 
65 Sequence<sal_Int8> SAL_CALL TextHtmlToHTMLFormat(Sequence<sal_Int8>& aTextHtml)
66 {
67 	OSL_ASSERT(aTextHtml.getLength() > 0);
68 
69 	if (!(aTextHtml.getLength() > 0))
70 		return Sequence<sal_Int8>();
71 
72 	// fill the buffer with dummy values to calc the exact length
73     std::string dummyHtmlHeader = GetHtmlFormatHeader(0, 0, 0, 0);
74 	size_t lHtmlFormatHeader = dummyHtmlHeader.length();
75 
76 	std::string textHtml(
77 	    reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()),
78 		reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()) + aTextHtml.getLength());
79 
80 	std::string::size_type nStartHtml = textHtml.find(TAG_HTML) + lHtmlFormatHeader - 1; // we start one before '<HTML>' Word 2000 does also so
81 	std::string::size_type nEndHtml = textHtml.find(TAG_END_HTML) + lHtmlFormatHeader + TAG_END_HTML.length() + 1; // our SOffice 5.2 wants 2 behind </HTML>?
82 
83 	// The body tag may have parameters so we need to search for the
84 	// closing '>' manually e.g. <BODY param> #92840#
85 	std::string::size_type nStartFragment = textHtml.find(">", textHtml.find(TAG_BODY)) + lHtmlFormatHeader + 1;
86 	std::string::size_type nEndFragment = textHtml.find(TAG_END_BODY) + lHtmlFormatHeader;
87 
88 	std::string htmlFormat = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment);
89 	htmlFormat += textHtml;
90 
91 	Sequence<sal_Int8> byteSequence(htmlFormat.length() + 1); // space the trailing '\0'
92 	rtl_zeroMemory(byteSequence.getArray(), byteSequence.getLength());
93 
94 	rtl_copyMemory(
95 		static_cast<void*>(byteSequence.getArray()),
96 		static_cast<const void*>(htmlFormat.c_str()),
97 		htmlFormat.length());
98 
99 	return byteSequence;
100 }
101 
102 const char* HtmlStartTag = "<html";
103 
104 Sequence<sal_Int8> HTMLFormatToTextHtml(const Sequence<sal_Int8>& aHTMLFormat)
105 {
106   BOOST_ASSERT(isHTMLFormat(aHTMLFormat) && "No HTML Format provided");
107 
108   Sequence<sal_Int8>& nonconstHTMLFormatRef = const_cast< Sequence<sal_Int8>& >(aHTMLFormat);
109   sal_Char* dataStart = reinterpret_cast<sal_Char*>(nonconstHTMLFormatRef.getArray());
110   sal_Char* dataEnd = dataStart + nonconstHTMLFormatRef.getLength() - 1;
111   const sal_Char* htmlStartTag = strcasestr(dataStart, HtmlStartTag);
112 
113   BOOST_ASSERT(htmlStartTag && "Seems to be no HTML at all");
114 
115   // It doesn't seem to be HTML? Well then simply return what has been
116   // provided in non-debug builds
117   if (htmlStartTag == NULL)
118 	{
119 	return aHTMLFormat;
120 	}
121 
122   sal_Int32 len = dataEnd - htmlStartTag;
123   Sequence<sal_Int8> plainHtmlData(len);
124 
125   rtl_copyMemory(static_cast<void*>(plainHtmlData.getArray()), htmlStartTag, len);
126 
127   return plainHtmlData;
128 }
129 
130 /* A simple format detection. We are just comparing the first few bytes
131    of the provided byte sequence to see whether or not it is the MS
132    Office Html format. If it shows that this is not reliable enough we
133    can improve this
134 */
135 const char HtmlFormatStart[] = "Version:";
136 int HtmlFormatStartLen = (sizeof(HtmlFormatStart) - 1);
137 
138 bool isHTMLFormat(const Sequence<sal_Int8>& aHtmlSequence)
139 {
140   if (aHtmlSequence.getLength() < HtmlFormatStartLen)
141 	return false;
142 
143   return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart,
144 												   HtmlFormatStartLen,
145 												   reinterpret_cast<const sal_Char*>(aHtmlSequence.getConstArray()),
146 												   HtmlFormatStartLen) == 0;
147 }
148