1*b0844812SAndrew Rist /************************************************************** 2cdf0e10cSrcweir * 3*b0844812SAndrew Rist * Licensed to the Apache Software Foundation (ASF) under one 4*b0844812SAndrew Rist * or more contributor license agreements. See the NOTICE file 5*b0844812SAndrew Rist * distributed with this work for additional information 6*b0844812SAndrew Rist * regarding copyright ownership. The ASF licenses this file 7*b0844812SAndrew Rist * to you under the Apache License, Version 2.0 (the 8*b0844812SAndrew Rist * "License"); you may not use this file except in compliance 9*b0844812SAndrew Rist * with the License. You may obtain a copy of the License at 10*b0844812SAndrew Rist * 11*b0844812SAndrew Rist * http://www.apache.org/licenses/LICENSE-2.0 12*b0844812SAndrew Rist * 13*b0844812SAndrew Rist * Unless required by applicable law or agreed to in writing, 14*b0844812SAndrew Rist * software distributed under the License is distributed on an 15*b0844812SAndrew Rist * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16*b0844812SAndrew Rist * KIND, either express or implied. See the License for the 17*b0844812SAndrew Rist * specific language governing permissions and limitations 18*b0844812SAndrew Rist * under the License. 19*b0844812SAndrew Rist * 20*b0844812SAndrew Rist *************************************************************/ 21*b0844812SAndrew Rist 22*b0844812SAndrew Rist 23cdf0e10cSrcweir 24cdf0e10cSrcweir /** 25cdf0e10cSrcweir * 26cdf0e10cSrcweir * 27cdf0e10cSrcweir * 28cdf0e10cSrcweir * 29cdf0e10cSrcweir * TODO 30cdf0e10cSrcweir * - Add exception throwing when h == NULL 31cdf0e10cSrcweir * - Not init h when implicit constructor is launched 32cdf0e10cSrcweir */ 33cdf0e10cSrcweir 34cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove 35cdf0e10cSrcweir #include "precompiled_lingucomponent.hxx" 36cdf0e10cSrcweir 37cdf0e10cSrcweir #include <string.h> 38cdf0e10cSrcweir #include <sstream> 39cdf0e10cSrcweir #include <iostream> 40cdf0e10cSrcweir 41cdf0e10cSrcweir #include <libtextcat/textcat.h> 42cdf0e10cSrcweir #include <libtextcat/common.h> 43cdf0e10cSrcweir #include <libtextcat/constants.h> 44cdf0e10cSrcweir #include <libtextcat/fingerprint.h> 45cdf0e10cSrcweir #include <libtextcat/utf8misc.h> 46cdf0e10cSrcweir 47cdf0e10cSrcweir #include <sal/types.h> 48cdf0e10cSrcweir 49cdf0e10cSrcweir #include "altstrfunc.hxx" 50cdf0e10cSrcweir #include "simpleguesser.hxx" 51cdf0e10cSrcweir 52cdf0e10cSrcweir #ifndef _UTF8_ 53cdf0e10cSrcweir #define _UTF8_ 54cdf0e10cSrcweir #endif 55cdf0e10cSrcweir 56cdf0e10cSrcweir 57cdf0e10cSrcweir using namespace std; 58cdf0e10cSrcweir 59cdf0e10cSrcweir 60cdf0e10cSrcweir /** 61cdf0e10cSrcweir * This 3 following structures are from fingerprint.c and textcat.c 62cdf0e10cSrcweir */ 63cdf0e10cSrcweir 64cdf0e10cSrcweir typedef struct ngram_t { 65cdf0e10cSrcweir 66cdf0e10cSrcweir sint2 rank; 67cdf0e10cSrcweir char str[MAXNGRAMSIZE+1]; 68cdf0e10cSrcweir 69cdf0e10cSrcweir } ngram_t; 70cdf0e10cSrcweir 71cdf0e10cSrcweir typedef struct fp_t { 72cdf0e10cSrcweir 73cdf0e10cSrcweir const char *name; 74cdf0e10cSrcweir ngram_t *fprint; 75cdf0e10cSrcweir uint4 size; 76cdf0e10cSrcweir 77cdf0e10cSrcweir } fp_t; 78cdf0e10cSrcweir 79cdf0e10cSrcweir typedef struct textcat_t{ 80cdf0e10cSrcweir 81cdf0e10cSrcweir void **fprint; 82cdf0e10cSrcweir char *fprint_disable; 83cdf0e10cSrcweir uint4 size; 84cdf0e10cSrcweir uint4 maxsize; 85cdf0e10cSrcweir 86cdf0e10cSrcweir char output[MAXOUTPUTSIZE]; 87cdf0e10cSrcweir 88cdf0e10cSrcweir } textcat_t; 89cdf0e10cSrcweir /** end of the 3 structs */ 90cdf0e10cSrcweir 91cdf0e10cSrcweir SimpleGuesser::SimpleGuesser() 92cdf0e10cSrcweir { 93cdf0e10cSrcweir h = NULL; 94cdf0e10cSrcweir } 95cdf0e10cSrcweir 96cdf0e10cSrcweir void SimpleGuesser::operator=(SimpleGuesser& sg){ 97cdf0e10cSrcweir if(h){textcat_Done(h);} 98cdf0e10cSrcweir h = sg.h; 99cdf0e10cSrcweir } 100cdf0e10cSrcweir 101cdf0e10cSrcweir SimpleGuesser::~SimpleGuesser() 102cdf0e10cSrcweir { 103cdf0e10cSrcweir if(h){textcat_Done(h);} 104cdf0e10cSrcweir } 105cdf0e10cSrcweir 106cdf0e10cSrcweir 107cdf0e10cSrcweir /*! 108cdf0e10cSrcweir \fn SimpleGuesser::GuessLanguage(char* text) 109cdf0e10cSrcweir */ 110cdf0e10cSrcweir vector<Guess> SimpleGuesser::GuessLanguage(char* text) 111cdf0e10cSrcweir { 112cdf0e10cSrcweir vector<Guess> guesses; 113cdf0e10cSrcweir 114cdf0e10cSrcweir if(!h){return guesses;} 115cdf0e10cSrcweir 116cdf0e10cSrcweir //calculate le number of unicode charcters (symbols) 117cdf0e10cSrcweir int len = utfstrlen(text); 118cdf0e10cSrcweir 119cdf0e10cSrcweir if( len > MAX_STRING_LENGTH_TO_ANALYSE ){len = MAX_STRING_LENGTH_TO_ANALYSE ;} 120cdf0e10cSrcweir 121cdf0e10cSrcweir char *guess_list = textcat_Classify(h, text, len); 122cdf0e10cSrcweir 123cdf0e10cSrcweir if(strcmp(guess_list, _TEXTCAT_RESULT_SHORT) == 0){ 124cdf0e10cSrcweir return guesses; 125cdf0e10cSrcweir } 126cdf0e10cSrcweir 127cdf0e10cSrcweir int current_pointer = 0; 128cdf0e10cSrcweir 129cdf0e10cSrcweir for(int i = 0; guess_list[current_pointer] != '\0'; i++) 130cdf0e10cSrcweir { 131cdf0e10cSrcweir while(guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0'){ 132cdf0e10cSrcweir current_pointer++; 133cdf0e10cSrcweir } 134cdf0e10cSrcweir if(guess_list[current_pointer] != '\0') 135cdf0e10cSrcweir { 136cdf0e10cSrcweir Guess g((char*)(guess_list + current_pointer)); 137cdf0e10cSrcweir 138cdf0e10cSrcweir guesses.push_back(g); 139cdf0e10cSrcweir 140cdf0e10cSrcweir current_pointer++; 141cdf0e10cSrcweir } 142cdf0e10cSrcweir } 143cdf0e10cSrcweir 144cdf0e10cSrcweir return guesses; 145cdf0e10cSrcweir } 146cdf0e10cSrcweir 147cdf0e10cSrcweir /*! 148cdf0e10cSrcweir \fn SimpleGuesser::GuessPrimaryLanguage(char* text) 149cdf0e10cSrcweir */ 150cdf0e10cSrcweir Guess SimpleGuesser::GuessPrimaryLanguage(char* text) 151cdf0e10cSrcweir { 152cdf0e10cSrcweir vector<Guess> ret = GuessLanguage(text); 153cdf0e10cSrcweir if(ret.size() > 0){ 154cdf0e10cSrcweir return GuessLanguage(text)[0]; 155cdf0e10cSrcweir } 156cdf0e10cSrcweir else{ 157cdf0e10cSrcweir return Guess(); 158cdf0e10cSrcweir } 159cdf0e10cSrcweir } 160cdf0e10cSrcweir /** 161cdf0e10cSrcweir * Is used to know wich language is available, unavailable or both 162cdf0e10cSrcweir * when mask = 0xF0, return only Available 163cdf0e10cSrcweir * when mask = 0x0F, return only Unavailable 164cdf0e10cSrcweir * when mask = 0xFF, return both Available and Unavailable 165cdf0e10cSrcweir */ 166cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask) 167cdf0e10cSrcweir { 168cdf0e10cSrcweir size_t i; 169cdf0e10cSrcweir textcat_t *tables = (textcat_t*)h; 170cdf0e10cSrcweir 171cdf0e10cSrcweir vector<Guess> lang; 172cdf0e10cSrcweir if(!h){return lang;} 173cdf0e10cSrcweir 174cdf0e10cSrcweir for (i=0; i<tables->size; i++) { 175cdf0e10cSrcweir if(tables->fprint_disable[i] & mask){ 176cdf0e10cSrcweir string langStr = "["; 177cdf0e10cSrcweir langStr += (char*)fp_Name(tables->fprint[i]); 178cdf0e10cSrcweir Guess g( (char *)langStr.c_str()); 179cdf0e10cSrcweir lang.push_back(g); 180cdf0e10cSrcweir } 181cdf0e10cSrcweir } 182cdf0e10cSrcweir 183cdf0e10cSrcweir return lang; 184cdf0e10cSrcweir } 185cdf0e10cSrcweir 186cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetAvailableLanguages(){ 187cdf0e10cSrcweir return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) ); 188cdf0e10cSrcweir } 189cdf0e10cSrcweir 190cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetUnavailableLanguages(){ 191cdf0e10cSrcweir return GetManagedLanguages( sal::static_int_cast< char >( 0x0F )); 192cdf0e10cSrcweir } 193cdf0e10cSrcweir 194cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetAllManagedLanguages(){ 195cdf0e10cSrcweir return GetManagedLanguages( sal::static_int_cast< char >( 0xFF )); 196cdf0e10cSrcweir } 197cdf0e10cSrcweir 198cdf0e10cSrcweir void SimpleGuesser::XableLanguage(string lang, char mask){ 199cdf0e10cSrcweir size_t i; 200cdf0e10cSrcweir textcat_t *tables = (textcat_t*)h; 201cdf0e10cSrcweir 202cdf0e10cSrcweir if(!h){return;} 203cdf0e10cSrcweir 204cdf0e10cSrcweir for (i=0; i<tables->size; i++) { 205cdf0e10cSrcweir string language(fp_Name(tables->fprint[i])); 206cdf0e10cSrcweir if(start(language,lang) == 0){ 207cdf0e10cSrcweir //cout << language << endl; 208cdf0e10cSrcweir tables->fprint_disable[i] = mask; 209cdf0e10cSrcweir //continue; 210cdf0e10cSrcweir } 211cdf0e10cSrcweir } 212cdf0e10cSrcweir } 213cdf0e10cSrcweir 214cdf0e10cSrcweir void SimpleGuesser::EnableLanguage(string lang){ 215cdf0e10cSrcweir XableLanguage(lang, sal::static_int_cast< char >( 0xF0 )); 216cdf0e10cSrcweir } 217cdf0e10cSrcweir 218cdf0e10cSrcweir void SimpleGuesser::DisableLanguage(string lang){ 219cdf0e10cSrcweir XableLanguage(lang, sal::static_int_cast< char >( 0x0F )); 220cdf0e10cSrcweir } 221cdf0e10cSrcweir 222cdf0e10cSrcweir /** 223cdf0e10cSrcweir * 224cdf0e10cSrcweir */ 225cdf0e10cSrcweir void SimpleGuesser::SetDBPath(const char* path, const char* prefix){ 226cdf0e10cSrcweir if(h){ 227cdf0e10cSrcweir textcat_Done(h); 228cdf0e10cSrcweir } 229cdf0e10cSrcweir h = special_textcat_Init(path, prefix); 230cdf0e10cSrcweir } 231