1 /*************************************************************************** 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 /** 29 * 30 * 31 * 32 * 33 * TODO 34 * - Add exception throwing when h == NULL 35 * - Not init h when implicit constructor is launched 36 */ 37 38 // MARKER(update_precomp.py): autogen include statement, do not remove 39 #include "precompiled_lingucomponent.hxx" 40 41 #include <string.h> 42 #include <sstream> 43 #include <iostream> 44 45 #include <libtextcat/textcat.h> 46 #include <libtextcat/common.h> 47 #include <libtextcat/constants.h> 48 #include <libtextcat/fingerprint.h> 49 #include <libtextcat/utf8misc.h> 50 51 #include <sal/types.h> 52 53 #include "altstrfunc.hxx" 54 #include "simpleguesser.hxx" 55 56 #ifndef _UTF8_ 57 #define _UTF8_ 58 #endif 59 60 61 using namespace std; 62 63 64 /** 65 * This 3 following structures are from fingerprint.c and textcat.c 66 */ 67 68 typedef struct ngram_t { 69 70 sint2 rank; 71 char str[MAXNGRAMSIZE+1]; 72 73 } ngram_t; 74 75 typedef struct fp_t { 76 77 const char *name; 78 ngram_t *fprint; 79 uint4 size; 80 81 } fp_t; 82 83 typedef struct textcat_t{ 84 85 void **fprint; 86 char *fprint_disable; 87 uint4 size; 88 uint4 maxsize; 89 90 char output[MAXOUTPUTSIZE]; 91 92 } textcat_t; 93 /** end of the 3 structs */ 94 95 SimpleGuesser::SimpleGuesser() 96 { 97 h = NULL; 98 } 99 100 void SimpleGuesser::operator=(SimpleGuesser& sg){ 101 if(h){textcat_Done(h);} 102 h = sg.h; 103 } 104 105 SimpleGuesser::~SimpleGuesser() 106 { 107 if(h){textcat_Done(h);} 108 } 109 110 111 /*! 112 \fn SimpleGuesser::GuessLanguage(char* text) 113 */ 114 vector<Guess> SimpleGuesser::GuessLanguage(char* text) 115 { 116 vector<Guess> guesses; 117 118 if(!h){return guesses;} 119 120 //calculate le number of unicode charcters (symbols) 121 int len = utfstrlen(text); 122 123 if( len > MAX_STRING_LENGTH_TO_ANALYSE ){len = MAX_STRING_LENGTH_TO_ANALYSE ;} 124 125 char *guess_list = textcat_Classify(h, text, len); 126 127 if(strcmp(guess_list, _TEXTCAT_RESULT_SHORT) == 0){ 128 return guesses; 129 } 130 131 int current_pointer = 0; 132 133 for(int i = 0; guess_list[current_pointer] != '\0'; i++) 134 { 135 while(guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0'){ 136 current_pointer++; 137 } 138 if(guess_list[current_pointer] != '\0') 139 { 140 Guess g((char*)(guess_list + current_pointer)); 141 142 guesses.push_back(g); 143 144 current_pointer++; 145 } 146 } 147 148 return guesses; 149 } 150 151 /*! 152 \fn SimpleGuesser::GuessPrimaryLanguage(char* text) 153 */ 154 Guess SimpleGuesser::GuessPrimaryLanguage(char* text) 155 { 156 vector<Guess> ret = GuessLanguage(text); 157 if(ret.size() > 0){ 158 return GuessLanguage(text)[0]; 159 } 160 else{ 161 return Guess(); 162 } 163 } 164 /** 165 * Is used to know wich language is available, unavailable or both 166 * when mask = 0xF0, return only Available 167 * when mask = 0x0F, return only Unavailable 168 * when mask = 0xFF, return both Available and Unavailable 169 */ 170 vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask) 171 { 172 size_t i; 173 textcat_t *tables = (textcat_t*)h; 174 175 vector<Guess> lang; 176 if(!h){return lang;} 177 178 for (i=0; i<tables->size; i++) { 179 if(tables->fprint_disable[i] & mask){ 180 string langStr = "["; 181 langStr += (char*)fp_Name(tables->fprint[i]); 182 Guess g( (char *)langStr.c_str()); 183 lang.push_back(g); 184 } 185 } 186 187 return lang; 188 } 189 190 vector<Guess> SimpleGuesser::GetAvailableLanguages(){ 191 return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) ); 192 } 193 194 vector<Guess> SimpleGuesser::GetUnavailableLanguages(){ 195 return GetManagedLanguages( sal::static_int_cast< char >( 0x0F )); 196 } 197 198 vector<Guess> SimpleGuesser::GetAllManagedLanguages(){ 199 return GetManagedLanguages( sal::static_int_cast< char >( 0xFF )); 200 } 201 202 void SimpleGuesser::XableLanguage(string lang, char mask){ 203 size_t i; 204 textcat_t *tables = (textcat_t*)h; 205 206 if(!h){return;} 207 208 for (i=0; i<tables->size; i++) { 209 string language(fp_Name(tables->fprint[i])); 210 if(start(language,lang) == 0){ 211 //cout << language << endl; 212 tables->fprint_disable[i] = mask; 213 //continue; 214 } 215 } 216 } 217 218 void SimpleGuesser::EnableLanguage(string lang){ 219 XableLanguage(lang, sal::static_int_cast< char >( 0xF0 )); 220 } 221 222 void SimpleGuesser::DisableLanguage(string lang){ 223 XableLanguage(lang, sal::static_int_cast< char >( 0x0F )); 224 } 225 226 /** 227 * 228 */ 229 void SimpleGuesser::SetDBPath(const char* path, const char* prefix){ 230 if(h){ 231 textcat_Done(h); 232 } 233 h = special_textcat_Init(path, prefix); 234 } 235