1*cdf0e10cSrcweir /*************************************************************************** 2*cdf0e10cSrcweir * 3*cdf0e10cSrcweir * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4*cdf0e10cSrcweir * 5*cdf0e10cSrcweir * Copyright 2000, 2010 Oracle and/or its affiliates. 6*cdf0e10cSrcweir * 7*cdf0e10cSrcweir * OpenOffice.org - a multi-platform office productivity suite 8*cdf0e10cSrcweir * 9*cdf0e10cSrcweir * This file is part of OpenOffice.org. 10*cdf0e10cSrcweir * 11*cdf0e10cSrcweir * OpenOffice.org is free software: you can redistribute it and/or modify 12*cdf0e10cSrcweir * it under the terms of the GNU Lesser General Public License version 3 13*cdf0e10cSrcweir * only, as published by the Free Software Foundation. 14*cdf0e10cSrcweir * 15*cdf0e10cSrcweir * OpenOffice.org is distributed in the hope that it will be useful, 16*cdf0e10cSrcweir * but WITHOUT ANY WARRANTY; without even the implied warranty of 17*cdf0e10cSrcweir * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18*cdf0e10cSrcweir * GNU Lesser General Public License version 3 for more details 19*cdf0e10cSrcweir * (a copy is included in the LICENSE file that accompanied this code). 20*cdf0e10cSrcweir * 21*cdf0e10cSrcweir * You should have received a copy of the GNU Lesser General Public License 22*cdf0e10cSrcweir * version 3 along with OpenOffice.org. If not, see 23*cdf0e10cSrcweir * <http://www.openoffice.org/license.html> 24*cdf0e10cSrcweir * for a copy of the LGPLv3 License. 
 *
 ************************************************************************/

/**
 * Implementation of SimpleGuesser, built on top of the textcat API
 * declared in the <libtextcat/...> headers included below.
 *
 * TODO
 * - Add exception throwing when h == NULL
 * - Not init h when implicit constructor is launched
 */

// MARKER(update_precomp.py): autogen include statement, do not remove
#include "precompiled_lingucomponent.hxx"

#include <string.h>
#include <sstream>
#include <iostream>

#include <libtextcat/textcat.h>
#include <libtextcat/common.h>
#include <libtextcat/constants.h>
#include <libtextcat/fingerprint.h>
#include <libtextcat/utf8misc.h>

#include <sal/types.h>

#include "altstrfunc.hxx"
#include "simpleguesser.hxx"

#ifndef _UTF8_
#define _UTF8_
#endif


using namespace std;


/**
 * The two structures below, together with textcat_t which follows them,
 * are copies of structures taken from fingerprint.c and textcat.c.
 *
 * NOTE(review): presumably these are libtextcat's own private definitions;
 * if so, the field order and types here must stay in sync with the library
 * sources -- confirm against the libtextcat version in use.
 */

/** One n-gram entry: a rank plus the n-gram text (at most MAXNGRAMSIZE chars and a terminator). */
typedef struct ngram_t {

    sint2 rank;
    char str[MAXNGRAMSIZE+1];

} ngram_t;

/** A named fingerprint: a name, a pointer to ngram_t entries and a size field. */
typedef struct fp_t {

    const char *name;
    ngram_t *fprint;
    uint4 size;

} fp_t;

83*cdf0e10cSrcweir typedef struct textcat_t{ 84*cdf0e10cSrcweir 85*cdf0e10cSrcweir void **fprint; 86*cdf0e10cSrcweir char *fprint_disable; 87*cdf0e10cSrcweir uint4 size; 88*cdf0e10cSrcweir uint4 maxsize; 89*cdf0e10cSrcweir 90*cdf0e10cSrcweir char output[MAXOUTPUTSIZE]; 91*cdf0e10cSrcweir 92*cdf0e10cSrcweir } textcat_t; 93*cdf0e10cSrcweir /** end of the 3 structs */ 94*cdf0e10cSrcweir 95*cdf0e10cSrcweir SimpleGuesser::SimpleGuesser() 96*cdf0e10cSrcweir { 97*cdf0e10cSrcweir h = NULL; 98*cdf0e10cSrcweir } 99*cdf0e10cSrcweir 100*cdf0e10cSrcweir void SimpleGuesser::operator=(SimpleGuesser& sg){ 101*cdf0e10cSrcweir if(h){textcat_Done(h);} 102*cdf0e10cSrcweir h = sg.h; 103*cdf0e10cSrcweir } 104*cdf0e10cSrcweir 105*cdf0e10cSrcweir SimpleGuesser::~SimpleGuesser() 106*cdf0e10cSrcweir { 107*cdf0e10cSrcweir if(h){textcat_Done(h);} 108*cdf0e10cSrcweir } 109*cdf0e10cSrcweir 110*cdf0e10cSrcweir 111*cdf0e10cSrcweir /*! 112*cdf0e10cSrcweir \fn SimpleGuesser::GuessLanguage(char* text) 113*cdf0e10cSrcweir */ 114*cdf0e10cSrcweir vector<Guess> SimpleGuesser::GuessLanguage(char* text) 115*cdf0e10cSrcweir { 116*cdf0e10cSrcweir vector<Guess> guesses; 117*cdf0e10cSrcweir 118*cdf0e10cSrcweir if(!h){return guesses;} 119*cdf0e10cSrcweir 120*cdf0e10cSrcweir //calculate le number of unicode charcters (symbols) 121*cdf0e10cSrcweir int len = utfstrlen(text); 122*cdf0e10cSrcweir 123*cdf0e10cSrcweir if( len > MAX_STRING_LENGTH_TO_ANALYSE ){len = MAX_STRING_LENGTH_TO_ANALYSE ;} 124*cdf0e10cSrcweir 125*cdf0e10cSrcweir char *guess_list = textcat_Classify(h, text, len); 126*cdf0e10cSrcweir 127*cdf0e10cSrcweir if(strcmp(guess_list, _TEXTCAT_RESULT_SHORT) == 0){ 128*cdf0e10cSrcweir return guesses; 129*cdf0e10cSrcweir } 130*cdf0e10cSrcweir 131*cdf0e10cSrcweir int current_pointer = 0; 132*cdf0e10cSrcweir 133*cdf0e10cSrcweir for(int i = 0; guess_list[current_pointer] != '\0'; i++) 134*cdf0e10cSrcweir { 135*cdf0e10cSrcweir while(guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && 
guess_list[current_pointer] != '\0'){ 136*cdf0e10cSrcweir current_pointer++; 137*cdf0e10cSrcweir } 138*cdf0e10cSrcweir if(guess_list[current_pointer] != '\0') 139*cdf0e10cSrcweir { 140*cdf0e10cSrcweir Guess g((char*)(guess_list + current_pointer)); 141*cdf0e10cSrcweir 142*cdf0e10cSrcweir guesses.push_back(g); 143*cdf0e10cSrcweir 144*cdf0e10cSrcweir current_pointer++; 145*cdf0e10cSrcweir } 146*cdf0e10cSrcweir } 147*cdf0e10cSrcweir 148*cdf0e10cSrcweir return guesses; 149*cdf0e10cSrcweir } 150*cdf0e10cSrcweir 151*cdf0e10cSrcweir /*! 152*cdf0e10cSrcweir \fn SimpleGuesser::GuessPrimaryLanguage(char* text) 153*cdf0e10cSrcweir */ 154*cdf0e10cSrcweir Guess SimpleGuesser::GuessPrimaryLanguage(char* text) 155*cdf0e10cSrcweir { 156*cdf0e10cSrcweir vector<Guess> ret = GuessLanguage(text); 157*cdf0e10cSrcweir if(ret.size() > 0){ 158*cdf0e10cSrcweir return GuessLanguage(text)[0]; 159*cdf0e10cSrcweir } 160*cdf0e10cSrcweir else{ 161*cdf0e10cSrcweir return Guess(); 162*cdf0e10cSrcweir } 163*cdf0e10cSrcweir } 164*cdf0e10cSrcweir /** 165*cdf0e10cSrcweir * Is used to know wich language is available, unavailable or both 166*cdf0e10cSrcweir * when mask = 0xF0, return only Available 167*cdf0e10cSrcweir * when mask = 0x0F, return only Unavailable 168*cdf0e10cSrcweir * when mask = 0xFF, return both Available and Unavailable 169*cdf0e10cSrcweir */ 170*cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask) 171*cdf0e10cSrcweir { 172*cdf0e10cSrcweir size_t i; 173*cdf0e10cSrcweir textcat_t *tables = (textcat_t*)h; 174*cdf0e10cSrcweir 175*cdf0e10cSrcweir vector<Guess> lang; 176*cdf0e10cSrcweir if(!h){return lang;} 177*cdf0e10cSrcweir 178*cdf0e10cSrcweir for (i=0; i<tables->size; i++) { 179*cdf0e10cSrcweir if(tables->fprint_disable[i] & mask){ 180*cdf0e10cSrcweir string langStr = "["; 181*cdf0e10cSrcweir langStr += (char*)fp_Name(tables->fprint[i]); 182*cdf0e10cSrcweir Guess g( (char *)langStr.c_str()); 183*cdf0e10cSrcweir lang.push_back(g); 184*cdf0e10cSrcweir } 
185*cdf0e10cSrcweir } 186*cdf0e10cSrcweir 187*cdf0e10cSrcweir return lang; 188*cdf0e10cSrcweir } 189*cdf0e10cSrcweir 190*cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetAvailableLanguages(){ 191*cdf0e10cSrcweir return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) ); 192*cdf0e10cSrcweir } 193*cdf0e10cSrcweir 194*cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetUnavailableLanguages(){ 195*cdf0e10cSrcweir return GetManagedLanguages( sal::static_int_cast< char >( 0x0F )); 196*cdf0e10cSrcweir } 197*cdf0e10cSrcweir 198*cdf0e10cSrcweir vector<Guess> SimpleGuesser::GetAllManagedLanguages(){ 199*cdf0e10cSrcweir return GetManagedLanguages( sal::static_int_cast< char >( 0xFF )); 200*cdf0e10cSrcweir } 201*cdf0e10cSrcweir 202*cdf0e10cSrcweir void SimpleGuesser::XableLanguage(string lang, char mask){ 203*cdf0e10cSrcweir size_t i; 204*cdf0e10cSrcweir textcat_t *tables = (textcat_t*)h; 205*cdf0e10cSrcweir 206*cdf0e10cSrcweir if(!h){return;} 207*cdf0e10cSrcweir 208*cdf0e10cSrcweir for (i=0; i<tables->size; i++) { 209*cdf0e10cSrcweir string language(fp_Name(tables->fprint[i])); 210*cdf0e10cSrcweir if(start(language,lang) == 0){ 211*cdf0e10cSrcweir //cout << language << endl; 212*cdf0e10cSrcweir tables->fprint_disable[i] = mask; 213*cdf0e10cSrcweir //continue; 214*cdf0e10cSrcweir } 215*cdf0e10cSrcweir } 216*cdf0e10cSrcweir } 217*cdf0e10cSrcweir 218*cdf0e10cSrcweir void SimpleGuesser::EnableLanguage(string lang){ 219*cdf0e10cSrcweir XableLanguage(lang, sal::static_int_cast< char >( 0xF0 )); 220*cdf0e10cSrcweir } 221*cdf0e10cSrcweir 222*cdf0e10cSrcweir void SimpleGuesser::DisableLanguage(string lang){ 223*cdf0e10cSrcweir XableLanguage(lang, sal::static_int_cast< char >( 0x0F )); 224*cdf0e10cSrcweir } 225*cdf0e10cSrcweir 226*cdf0e10cSrcweir /** 227*cdf0e10cSrcweir * 228*cdf0e10cSrcweir */ 229*cdf0e10cSrcweir void SimpleGuesser::SetDBPath(const char* path, const char* prefix){ 230*cdf0e10cSrcweir if(h){ 231*cdf0e10cSrcweir textcat_Done(h); 232*cdf0e10cSrcweir } 
233*cdf0e10cSrcweir h = special_textcat_Init(path, prefix); 234*cdf0e10cSrcweir } 235