1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24  /**
25   *
26   *
27   *
28   *
29   * TODO
30   * - Add exception throwing when h == NULL
31   * - Not init h when implicit constructor is launched
32   */
33 
34 // MARKER(update_precomp.py): autogen include statement, do not remove
35 #include "precompiled_guesslang.hxx"
36 
37 #include <string.h>
38 #include <sstream>
39 #include <iostream>
40 
41 #include <libtextcat/textcat.h>
42 #include <libtextcat/common.h>
43 #include <libtextcat/constants.h>
44 #include <libtextcat/fingerprint.h>
45 #include <libtextcat/utf8misc.h>
46 
47 #include <sal/types.h>
48 
49 #include "altstrfunc.hxx"
50 #include "simpleguesser.hxx"
51 
52 #ifndef _UTF8_
53 #define _UTF8_
54 #endif
55 
56 
57 using namespace std;
58 
59 
/**
 * The following 3 structures are from fingerprint.c and textcat.c
 */
63 
64 typedef struct ngram_t {
65 
66     sint2 rank;
67     char str[MAXNGRAMSIZE+1];
68 
69 } ngram_t;
70 
71 typedef struct fp_t {
72 
73     const char *name;
74     ngram_t *fprint;
75     uint4 size;
76 
77 } fp_t;
78 
79 typedef struct textcat_t{
80 
81     void **fprint;
82     char *fprint_disable;
83     uint4 size;
84     uint4 maxsize;
85 
86     char output[MAXOUTPUTSIZE];
87 
88 } textcat_t;
89 /** end of the 3 structs */
90 
SimpleGuesser()91 SimpleGuesser::SimpleGuesser()
92 {
93     h = NULL;
94 }
95 
operator =(SimpleGuesser & sg)96 void SimpleGuesser::operator=(SimpleGuesser& sg){
97     if(h){textcat_Done(h);}
98     h = sg.h;
99 }
100 
~SimpleGuesser()101 SimpleGuesser::~SimpleGuesser()
102 {
103     if(h){textcat_Done(h);}
104 }
105 
106 
107 /*!
108     \fn SimpleGuesser::GuessLanguage(char* text)
109  */
GuessLanguage(char * text)110 vector<Guess> SimpleGuesser::GuessLanguage(char* text)
111 {
112         vector<Guess> guesses;
113 
114         if(!h){return guesses;}
115 
116         //calculate le number of unicode charcters (symbols)
117         int len = utfstrlen(text);
118 
119 	if( len > MAX_STRING_LENGTH_TO_ANALYSE ){len = MAX_STRING_LENGTH_TO_ANALYSE ;}
120 
121         char *guess_list = textcat_Classify(h, text, len);
122 
123         if(strcmp(guess_list, _TEXTCAT_RESULT_SHORT) == 0){
124             return guesses;
125         }
126 
127         int current_pointer = 0;
128 
129         for(int i = 0; guess_list[current_pointer] != '\0'; i++)
130         {
131             while(guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0'){
132                 current_pointer++;
133             }
134             if(guess_list[current_pointer] != '\0')
135             {
136                 Guess g((char*)(guess_list + current_pointer));
137 
138                 guesses.push_back(g);
139 
140                 current_pointer++;
141             }
142         }
143 
144 	return guesses;
145 }
146 
147 /*!
148     \fn SimpleGuesser::GuessPrimaryLanguage(char* text)
149  */
GuessPrimaryLanguage(char * text)150 Guess SimpleGuesser::GuessPrimaryLanguage(char* text)
151 {
152     vector<Guess> ret = GuessLanguage(text);
153     if(ret.size() > 0){
154         return GuessLanguage(text)[0];
155     }
156     else{
157         return Guess();
158     }
159 }
160 /**
161  * Is used to know which language is available, unavailable or both
162  * when mask = 0xF0, return only Available
163  * when mask = 0x0F, return only Unavailable
164  * when mask = 0xFF, return both Available and Unavailable
165  */
GetManagedLanguages(const char mask)166 vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask)
167 {
168     size_t i;
169     textcat_t *tables = (textcat_t*)h;
170 
171     vector<Guess> lang;
172     if(!h){return lang;}
173 
174     for (i=0; i<tables->size; i++) {
175         if(tables->fprint_disable[i] & mask){
176             string langStr = "[";
177             langStr += (char*)fp_Name(tables->fprint[i]);
178             Guess g( (char *)langStr.c_str());
179             lang.push_back(g);
180         }
181     }
182 
183     return lang;
184 }
185 
GetAvailableLanguages()186 vector<Guess> SimpleGuesser::GetAvailableLanguages(){
187     return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) );
188 }
189 
GetUnavailableLanguages()190 vector<Guess> SimpleGuesser::GetUnavailableLanguages(){
191     return GetManagedLanguages( sal::static_int_cast< char >( 0x0F ));
192 }
193 
GetAllManagedLanguages()194 vector<Guess> SimpleGuesser::GetAllManagedLanguages(){
195     return GetManagedLanguages( sal::static_int_cast< char >( 0xFF ));
196 }
197 
XableLanguage(string lang,char mask)198 void SimpleGuesser::XableLanguage(string lang, char mask){
199     size_t i;
200     textcat_t *tables = (textcat_t*)h;
201 
202     if(!h){return;}
203 
204     for (i=0; i<tables->size; i++) {
205         string language(fp_Name(tables->fprint[i]));
206         if(start(language,lang) == 0){
207             //cout << language << endl;
208             tables->fprint_disable[i] = mask;
209             //continue;
210         }
211     }
212 }
213 
EnableLanguage(string lang)214 void SimpleGuesser::EnableLanguage(string lang){
215     XableLanguage(lang,  sal::static_int_cast< char >( 0xF0 ));
216 }
217 
DisableLanguage(string lang)218 void SimpleGuesser::DisableLanguage(string lang){
219     XableLanguage(lang,  sal::static_int_cast< char >( 0x0F ));
220 }
221 
222 /**
223 *
224 */
SetDBPath(const char * path,const char * prefix)225 void SimpleGuesser::SetDBPath(const char* path, const char* prefix){
226     if(h){
227         textcat_Done(h);
228     }
229     h = special_textcat_Init(path, prefix);
230 }
231