1b0844812SAndrew Rist /**************************************************************
2cdf0e10cSrcweir *
3b0844812SAndrew Rist * Licensed to the Apache Software Foundation (ASF) under one
4b0844812SAndrew Rist * or more contributor license agreements. See the NOTICE file
5b0844812SAndrew Rist * distributed with this work for additional information
6b0844812SAndrew Rist * regarding copyright ownership. The ASF licenses this file
7b0844812SAndrew Rist * to you under the Apache License, Version 2.0 (the
8b0844812SAndrew Rist * "License"); you may not use this file except in compliance
9b0844812SAndrew Rist * with the License. You may obtain a copy of the License at
10b0844812SAndrew Rist *
11b0844812SAndrew Rist * http://www.apache.org/licenses/LICENSE-2.0
12b0844812SAndrew Rist *
13b0844812SAndrew Rist * Unless required by applicable law or agreed to in writing,
14b0844812SAndrew Rist * software distributed under the License is distributed on an
15b0844812SAndrew Rist * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16b0844812SAndrew Rist * KIND, either express or implied. See the License for the
17b0844812SAndrew Rist * specific language governing permissions and limitations
18b0844812SAndrew Rist * under the License.
19b0844812SAndrew Rist *
20b0844812SAndrew Rist *************************************************************/
21b0844812SAndrew Rist
22b0844812SAndrew Rist
23cdf0e10cSrcweir
24cdf0e10cSrcweir /**
25cdf0e10cSrcweir *
26cdf0e10cSrcweir *
27cdf0e10cSrcweir *
28cdf0e10cSrcweir *
29cdf0e10cSrcweir * TODO
30cdf0e10cSrcweir * - Add exception throwing when h == NULL
31cdf0e10cSrcweir * - Not init h when implicit constructor is launched
32cdf0e10cSrcweir */
33cdf0e10cSrcweir
34cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove
35*c4c42a0eSDamjan Jovanovic #include "precompiled_guesslang.hxx"
36cdf0e10cSrcweir
37cdf0e10cSrcweir #include <string.h>
38cdf0e10cSrcweir #include <sstream>
39cdf0e10cSrcweir #include <iostream>
40cdf0e10cSrcweir
41cdf0e10cSrcweir #include <libtextcat/textcat.h>
42cdf0e10cSrcweir #include <libtextcat/common.h>
43cdf0e10cSrcweir #include <libtextcat/constants.h>
44cdf0e10cSrcweir #include <libtextcat/fingerprint.h>
45cdf0e10cSrcweir #include <libtextcat/utf8misc.h>
46cdf0e10cSrcweir
47cdf0e10cSrcweir #include <sal/types.h>
48cdf0e10cSrcweir
49cdf0e10cSrcweir #include "altstrfunc.hxx"
50cdf0e10cSrcweir #include "simpleguesser.hxx"
51cdf0e10cSrcweir
52cdf0e10cSrcweir #ifndef _UTF8_
53cdf0e10cSrcweir #define _UTF8_
54cdf0e10cSrcweir #endif
55cdf0e10cSrcweir
56cdf0e10cSrcweir
57cdf0e10cSrcweir using namespace std;
58cdf0e10cSrcweir
59cdf0e10cSrcweir
/**
 * The following 3 structures are copied from fingerprint.c and textcat.c
 */
63cdf0e10cSrcweir
64cdf0e10cSrcweir typedef struct ngram_t {
65cdf0e10cSrcweir
66cdf0e10cSrcweir sint2 rank;
67cdf0e10cSrcweir char str[MAXNGRAMSIZE+1];
68cdf0e10cSrcweir
69cdf0e10cSrcweir } ngram_t;
70cdf0e10cSrcweir
71cdf0e10cSrcweir typedef struct fp_t {
72cdf0e10cSrcweir
73cdf0e10cSrcweir const char *name;
74cdf0e10cSrcweir ngram_t *fprint;
75cdf0e10cSrcweir uint4 size;
76cdf0e10cSrcweir
77cdf0e10cSrcweir } fp_t;
78cdf0e10cSrcweir
79cdf0e10cSrcweir typedef struct textcat_t{
80cdf0e10cSrcweir
81cdf0e10cSrcweir void **fprint;
82cdf0e10cSrcweir char *fprint_disable;
83cdf0e10cSrcweir uint4 size;
84cdf0e10cSrcweir uint4 maxsize;
85cdf0e10cSrcweir
86cdf0e10cSrcweir char output[MAXOUTPUTSIZE];
87cdf0e10cSrcweir
88cdf0e10cSrcweir } textcat_t;
89cdf0e10cSrcweir /** end of the 3 structs */
90cdf0e10cSrcweir
SimpleGuesser()91cdf0e10cSrcweir SimpleGuesser::SimpleGuesser()
92cdf0e10cSrcweir {
93cdf0e10cSrcweir h = NULL;
94cdf0e10cSrcweir }
95cdf0e10cSrcweir
operator =(SimpleGuesser & sg)96cdf0e10cSrcweir void SimpleGuesser::operator=(SimpleGuesser& sg){
97cdf0e10cSrcweir if(h){textcat_Done(h);}
98cdf0e10cSrcweir h = sg.h;
99cdf0e10cSrcweir }
100cdf0e10cSrcweir
~SimpleGuesser()101cdf0e10cSrcweir SimpleGuesser::~SimpleGuesser()
102cdf0e10cSrcweir {
103cdf0e10cSrcweir if(h){textcat_Done(h);}
104cdf0e10cSrcweir }
105cdf0e10cSrcweir
106cdf0e10cSrcweir
107cdf0e10cSrcweir /*!
108cdf0e10cSrcweir \fn SimpleGuesser::GuessLanguage(char* text)
109cdf0e10cSrcweir */
GuessLanguage(char * text)110cdf0e10cSrcweir vector<Guess> SimpleGuesser::GuessLanguage(char* text)
111cdf0e10cSrcweir {
112cdf0e10cSrcweir vector<Guess> guesses;
113cdf0e10cSrcweir
114cdf0e10cSrcweir if(!h){return guesses;}
115cdf0e10cSrcweir
116cdf0e10cSrcweir //calculate le number of unicode charcters (symbols)
117cdf0e10cSrcweir int len = utfstrlen(text);
118cdf0e10cSrcweir
119cdf0e10cSrcweir if( len > MAX_STRING_LENGTH_TO_ANALYSE ){len = MAX_STRING_LENGTH_TO_ANALYSE ;}
120cdf0e10cSrcweir
121cdf0e10cSrcweir char *guess_list = textcat_Classify(h, text, len);
122cdf0e10cSrcweir
123cdf0e10cSrcweir if(strcmp(guess_list, _TEXTCAT_RESULT_SHORT) == 0){
124cdf0e10cSrcweir return guesses;
125cdf0e10cSrcweir }
126cdf0e10cSrcweir
127cdf0e10cSrcweir int current_pointer = 0;
128cdf0e10cSrcweir
129cdf0e10cSrcweir for(int i = 0; guess_list[current_pointer] != '\0'; i++)
130cdf0e10cSrcweir {
131cdf0e10cSrcweir while(guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0'){
132cdf0e10cSrcweir current_pointer++;
133cdf0e10cSrcweir }
134cdf0e10cSrcweir if(guess_list[current_pointer] != '\0')
135cdf0e10cSrcweir {
136cdf0e10cSrcweir Guess g((char*)(guess_list + current_pointer));
137cdf0e10cSrcweir
138cdf0e10cSrcweir guesses.push_back(g);
139cdf0e10cSrcweir
140cdf0e10cSrcweir current_pointer++;
141cdf0e10cSrcweir }
142cdf0e10cSrcweir }
143cdf0e10cSrcweir
144cdf0e10cSrcweir return guesses;
145cdf0e10cSrcweir }
146cdf0e10cSrcweir
147cdf0e10cSrcweir /*!
148cdf0e10cSrcweir \fn SimpleGuesser::GuessPrimaryLanguage(char* text)
149cdf0e10cSrcweir */
GuessPrimaryLanguage(char * text)150cdf0e10cSrcweir Guess SimpleGuesser::GuessPrimaryLanguage(char* text)
151cdf0e10cSrcweir {
152cdf0e10cSrcweir vector<Guess> ret = GuessLanguage(text);
153cdf0e10cSrcweir if(ret.size() > 0){
154cdf0e10cSrcweir return GuessLanguage(text)[0];
155cdf0e10cSrcweir }
156cdf0e10cSrcweir else{
157cdf0e10cSrcweir return Guess();
158cdf0e10cSrcweir }
159cdf0e10cSrcweir }
160cdf0e10cSrcweir /**
161a893be29SPedro Giffuni * Is used to know which language is available, unavailable or both
162cdf0e10cSrcweir * when mask = 0xF0, return only Available
163cdf0e10cSrcweir * when mask = 0x0F, return only Unavailable
164cdf0e10cSrcweir * when mask = 0xFF, return both Available and Unavailable
165cdf0e10cSrcweir */
GetManagedLanguages(const char mask)166cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask)
167cdf0e10cSrcweir {
168cdf0e10cSrcweir size_t i;
169cdf0e10cSrcweir textcat_t *tables = (textcat_t*)h;
170cdf0e10cSrcweir
171cdf0e10cSrcweir vector<Guess> lang;
172cdf0e10cSrcweir if(!h){return lang;}
173cdf0e10cSrcweir
174cdf0e10cSrcweir for (i=0; i<tables->size; i++) {
175cdf0e10cSrcweir if(tables->fprint_disable[i] & mask){
176cdf0e10cSrcweir string langStr = "[";
177cdf0e10cSrcweir langStr += (char*)fp_Name(tables->fprint[i]);
178cdf0e10cSrcweir Guess g( (char *)langStr.c_str());
179cdf0e10cSrcweir lang.push_back(g);
180cdf0e10cSrcweir }
181cdf0e10cSrcweir }
182cdf0e10cSrcweir
183cdf0e10cSrcweir return lang;
184cdf0e10cSrcweir }
185cdf0e10cSrcweir
GetAvailableLanguages()186cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetAvailableLanguages(){
187cdf0e10cSrcweir return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) );
188cdf0e10cSrcweir }
189cdf0e10cSrcweir
GetUnavailableLanguages()190cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetUnavailableLanguages(){
191cdf0e10cSrcweir return GetManagedLanguages( sal::static_int_cast< char >( 0x0F ));
192cdf0e10cSrcweir }
193cdf0e10cSrcweir
GetAllManagedLanguages()194cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetAllManagedLanguages(){
195cdf0e10cSrcweir return GetManagedLanguages( sal::static_int_cast< char >( 0xFF ));
196cdf0e10cSrcweir }
197cdf0e10cSrcweir
XableLanguage(string lang,char mask)198cdf0e10cSrcweir void SimpleGuesser::XableLanguage(string lang, char mask){
199cdf0e10cSrcweir size_t i;
200cdf0e10cSrcweir textcat_t *tables = (textcat_t*)h;
201cdf0e10cSrcweir
202cdf0e10cSrcweir if(!h){return;}
203cdf0e10cSrcweir
204cdf0e10cSrcweir for (i=0; i<tables->size; i++) {
205cdf0e10cSrcweir string language(fp_Name(tables->fprint[i]));
206cdf0e10cSrcweir if(start(language,lang) == 0){
207cdf0e10cSrcweir //cout << language << endl;
208cdf0e10cSrcweir tables->fprint_disable[i] = mask;
209cdf0e10cSrcweir //continue;
210cdf0e10cSrcweir }
211cdf0e10cSrcweir }
212cdf0e10cSrcweir }
213cdf0e10cSrcweir
EnableLanguage(string lang)214cdf0e10cSrcweir void SimpleGuesser::EnableLanguage(string lang){
215cdf0e10cSrcweir XableLanguage(lang, sal::static_int_cast< char >( 0xF0 ));
216cdf0e10cSrcweir }
217cdf0e10cSrcweir
DisableLanguage(string lang)218cdf0e10cSrcweir void SimpleGuesser::DisableLanguage(string lang){
219cdf0e10cSrcweir XableLanguage(lang, sal::static_int_cast< char >( 0x0F ));
220cdf0e10cSrcweir }
221cdf0e10cSrcweir
222cdf0e10cSrcweir /**
223cdf0e10cSrcweir *
224cdf0e10cSrcweir */
SetDBPath(const char * path,const char * prefix)225cdf0e10cSrcweir void SimpleGuesser::SetDBPath(const char* path, const char* prefix){
226cdf0e10cSrcweir if(h){
227cdf0e10cSrcweir textcat_Done(h);
228cdf0e10cSrcweir }
229cdf0e10cSrcweir h = special_textcat_Init(path, prefix);
230cdf0e10cSrcweir }
231