1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23
24 /**
25 *
26 *
27 *
28 *
29 * TODO
30 * - Add exception throwing when h == NULL
31 * - Not init h when implicit constructor is launched
32 */
33
34 // MARKER(update_precomp.py): autogen include statement, do not remove
35 #include "precompiled_guesslang.hxx"
36
37 #include <string.h>
38 #include <sstream>
39 #include <iostream>
40
41 #include <libtextcat/textcat.h>
42 #include <libtextcat/common.h>
43 #include <libtextcat/constants.h>
44 #include <libtextcat/fingerprint.h>
45 #include <libtextcat/utf8misc.h>
46
47 #include <sal/types.h>
48
49 #include "altstrfunc.hxx"
50 #include "simpleguesser.hxx"
51
52 #ifndef _UTF8_
53 #define _UTF8_
54 #endif
55
56
57 using namespace std;
58
59
/**
 * The following 3 structures are taken from fingerprint.c and textcat.c
 */
63
64 typedef struct ngram_t {
65
66 sint2 rank;
67 char str[MAXNGRAMSIZE+1];
68
69 } ngram_t;
70
71 typedef struct fp_t {
72
73 const char *name;
74 ngram_t *fprint;
75 uint4 size;
76
77 } fp_t;
78
79 typedef struct textcat_t{
80
81 void **fprint;
82 char *fprint_disable;
83 uint4 size;
84 uint4 maxsize;
85
86 char output[MAXOUTPUTSIZE];
87
88 } textcat_t;
89 /** end of the 3 structs */
90
SimpleGuesser()91 SimpleGuesser::SimpleGuesser()
92 {
93 h = NULL;
94 }
95
operator =(SimpleGuesser & sg)96 void SimpleGuesser::operator=(SimpleGuesser& sg){
97 if(h){textcat_Done(h);}
98 h = sg.h;
99 }
100
~SimpleGuesser()101 SimpleGuesser::~SimpleGuesser()
102 {
103 if(h){textcat_Done(h);}
104 }
105
106
107 /*!
108 \fn SimpleGuesser::GuessLanguage(char* text)
109 */
GuessLanguage(char * text)110 vector<Guess> SimpleGuesser::GuessLanguage(char* text)
111 {
112 vector<Guess> guesses;
113
114 if(!h){return guesses;}
115
116 //calculate le number of unicode charcters (symbols)
117 int len = utfstrlen(text);
118
119 if( len > MAX_STRING_LENGTH_TO_ANALYSE ){len = MAX_STRING_LENGTH_TO_ANALYSE ;}
120
121 char *guess_list = textcat_Classify(h, text, len);
122
123 if(strcmp(guess_list, _TEXTCAT_RESULT_SHORT) == 0){
124 return guesses;
125 }
126
127 int current_pointer = 0;
128
129 for(int i = 0; guess_list[current_pointer] != '\0'; i++)
130 {
131 while(guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0'){
132 current_pointer++;
133 }
134 if(guess_list[current_pointer] != '\0')
135 {
136 Guess g((char*)(guess_list + current_pointer));
137
138 guesses.push_back(g);
139
140 current_pointer++;
141 }
142 }
143
144 return guesses;
145 }
146
147 /*!
148 \fn SimpleGuesser::GuessPrimaryLanguage(char* text)
149 */
GuessPrimaryLanguage(char * text)150 Guess SimpleGuesser::GuessPrimaryLanguage(char* text)
151 {
152 vector<Guess> ret = GuessLanguage(text);
153 if(ret.size() > 0){
154 return GuessLanguage(text)[0];
155 }
156 else{
157 return Guess();
158 }
159 }
160 /**
161 * Is used to know which language is available, unavailable or both
162 * when mask = 0xF0, return only Available
163 * when mask = 0x0F, return only Unavailable
164 * when mask = 0xFF, return both Available and Unavailable
165 */
GetManagedLanguages(const char mask)166 vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask)
167 {
168 size_t i;
169 textcat_t *tables = (textcat_t*)h;
170
171 vector<Guess> lang;
172 if(!h){return lang;}
173
174 for (i=0; i<tables->size; i++) {
175 if(tables->fprint_disable[i] & mask){
176 string langStr = "[";
177 langStr += (char*)fp_Name(tables->fprint[i]);
178 Guess g( (char *)langStr.c_str());
179 lang.push_back(g);
180 }
181 }
182
183 return lang;
184 }
185
GetAvailableLanguages()186 vector<Guess> SimpleGuesser::GetAvailableLanguages(){
187 return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) );
188 }
189
GetUnavailableLanguages()190 vector<Guess> SimpleGuesser::GetUnavailableLanguages(){
191 return GetManagedLanguages( sal::static_int_cast< char >( 0x0F ));
192 }
193
GetAllManagedLanguages()194 vector<Guess> SimpleGuesser::GetAllManagedLanguages(){
195 return GetManagedLanguages( sal::static_int_cast< char >( 0xFF ));
196 }
197
XableLanguage(string lang,char mask)198 void SimpleGuesser::XableLanguage(string lang, char mask){
199 size_t i;
200 textcat_t *tables = (textcat_t*)h;
201
202 if(!h){return;}
203
204 for (i=0; i<tables->size; i++) {
205 string language(fp_Name(tables->fprint[i]));
206 if(start(language,lang) == 0){
207 //cout << language << endl;
208 tables->fprint_disable[i] = mask;
209 //continue;
210 }
211 }
212 }
213
EnableLanguage(string lang)214 void SimpleGuesser::EnableLanguage(string lang){
215 XableLanguage(lang, sal::static_int_cast< char >( 0xF0 ));
216 }
217
DisableLanguage(string lang)218 void SimpleGuesser::DisableLanguage(string lang){
219 XableLanguage(lang, sal::static_int_cast< char >( 0x0F ));
220 }
221
222 /**
223 *
224 */
SetDBPath(const char * path,const char * prefix)225 void SimpleGuesser::SetDBPath(const char* path, const char* prefix){
226 if(h){
227 textcat_Done(h);
228 }
229 h = special_textcat_Init(path, prefix);
230 }
231