View file File name : hunspell_suggest.py Content :# -*- coding: utf-8 -*- # vim:et sts=4 sw=4 # # ibus-typing-booster - A completion input method for IBus # # Copyright (c) 2012-2013 Anish Patil <apatil@redhat.com> # Copyright (c) 2012-2016 Mike FABIAN <mfabian@redhat.com> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/> '''A module used by ibus-typing-booster to suggest words by using the hunspell dictonaries. ''' import os import sys import unicodedata import re import traceback import itb_util DEBUG_LEVEL = int(0) IMPORT_ENCHANT_SUCCESSFUL = False IMPORT_HUNSPELL_SUCCESSFUL = False try: import enchant IMPORT_ENCHANT_SUCCESSFUL = True except (ImportError,): try: import hunspell IMPORT_HUNSPELL_SUCCESSFUL = True except (ImportError,): pass # Maximum words that should be returned. # This should a rather big number in order not # to throw away useful matches. But making it very huge # makes the performance worse. For example when setting # it to 1000, I see a noticable delay when typing the first # letter of a word until the candidate lookup table pops up. MAX_WORDS = 100 class Dictionary: '''A class to hold a hunspell dictionary ''' def __init__(self, name = 'en_US'): if DEBUG_LEVEL > 1: sys.stderr.write( "Dictionary.__init__(name=%s)\n" %name) self.name = name self.dic_path = '' self.encoding = 'UTF-8' self.words = [] self.word_pairs = [] self.max_word_len = 0 # maximum length of words in this dictionary self.enchant_dict = None self.pyhunspell_object = None self.load_dictionary() def load_dictionary(self): '''Load a hunspell dictionary and instantiate a enchant.Dict() or a hunspell.Hunspell() object. ''' if DEBUG_LEVEL > 0: sys.stderr.write("load_dictionary() ...\n") (self.dic_path, self.encoding, self.words) = itb_util.get_hunspell_dictionary_wordlist(self.name) if self.words: # List of languages where accent insensitive matching makes sense: accent_languages = ( 'af', 'ast', 'az', 'be', 'bg', 'br', 'bs', 'ca', 'cs', 'csb', 'cv', 'cy', 'da', 'de', 'dsb', 'el', 'en', 'es', 'eu', 'fo', 'fr', 'fur', 'fy', 'ga', 'gd', 'gl', 'grc', 'gv', 'haw', 'hr', 'hsb', 'ht', 'hu', 'ia', 'is', 'it', 'kk', 'ku', 'ky', 'lb', 'ln', 'lv', 'mg', 'mi', 'mk', 'mn', 'mos', 'mt', 'nb', 'nds', 'nl', 'nn', 'nr', 'nso', 'ny', 'oc', 'pl', 'plt', 'pt', 'qu', 'quh', 'ru', 'sc', 'se', 'sh', 'shs', 'sk', 'sl', 'smj', 'sq', 'sr', 'ss', 'st', 'sv', 'tet', 'tk', 'tn', 'ts', 'uk', 'uz', 've', 'vi', 'wa', 'xh', ) if self.name.split('_')[0] in accent_languages: self.word_pairs = [ (x, itb_util.remove_accents(x)) for x in self.words ] for x in self.words: if len(x) > self.max_word_len: self.max_word_len = len(x) if DEBUG_LEVEL > 1: sys.stderr.write( 'load_dictionary() max_word_len = %s\n' % self.max_word_len) if IMPORT_ENCHANT_SUCCESSFUL: self.enchant_dict = enchant.Dict(self.name) elif IMPORT_HUNSPELL_SUCCESSFUL and self.dic_path: aff_path = self.dic_path.replace('.dic', '.aff') self.pyhunspell_object = hunspell.HunSpell(self.dic_path, aff_path) class Hunspell: '''A class to suggest completions or corrections using a list of Hunspell dictionaries ''' def __init__(self, dictionary_names=()): global DEBUG_LEVEL try: DEBUG_LEVEL = int(os.getenv('IBUS_TYPING_BOOSTER_DEBUG_LEVEL')) except (TypeError, ValueError): DEBUG_LEVEL = int(0) if DEBUG_LEVEL > 1: if dictionary_names: sys.stderr.write( 'Hunspell.__init__(dictionary_names=%s)\n' %dictionary_names) else: sys.stderr.write( 'Hunspell.__init__(dictionary_names=())\n') self._suggest_cache = {} self._dictionary_names = dictionary_names self._dictionaries = [] self.init_dictionaries() def init_dictionaries(self): '''Initialize the hunspell dictionaries ''' if DEBUG_LEVEL > 1: if self._dictionary_names: sys.stderr.write( 'Hunspell.init_dictionaries() dictionary_names=%s\n' %self._dictionary_names) else: sys.stderr.write( 'Hunspell.init_dictionaries() dictionary_names=()\n') self._suggest_cache = {} self._dictionaries = [] for dictionary_name in self._dictionary_names: self._dictionaries.append(Dictionary(name=dictionary_name)) def get_dictionary_names(self): '''Returns a copy of the list of dictionary names. It is important to return a copy, we do not want to change the private member variable directly.''' return self._dictionary_names[:] def set_dictionary_names(self, dictionary_names): '''Sets the list of dictionary names. If the new list of dictionary names differs from the existing one, re-initilize the dictionaries. ''' if dictionary_names != self._dictionary_names: self._dictionary_names = dictionary_names self.init_dictionaries() def suggest(self, input_phrase): '''Return completions or corrections for the input phrase :param input_phrase: A string to find completions or corrections for :type input_phrase: String :rtype: A list of tuples of the form (<word>, <score>) <score> can have these values: 0: This is a completion, i.e. input_phrase matches the beginning of <word> (accent insensitive match) -1: This is a spell checking correction from hunspell (i.e. either from enchant or pyhunspell) Examples: (Attention, the return values are in internal normalization form ('NFD')) >>> h = Hunspell(['de_DE', 'cs_CZ']) >>> h.suggest('Geschwindigkeitsubertre')[0] ('Geschwindigkeitsübertretungsverfahren', 0) >>> h.suggest('Geschwindigkeitsübertretungsverfahren')[0] ('Geschwindigkeitsübertretungsverfahren', 0) >>> h.suggest('Glühwürmchen')[0] ('Glühwürmchen', 0) >>> h.suggest('Alpengluhen')[0] ('Alpenglühen', 0) >>> h.suggest('filosofictejsi') [('filosofičtější', 0), ('filosofičtěji', -1)] >>> h.suggest('filosofictejs')[0] ('filosofičtější', 0) >>> h.suggest('filosofičtější')[0] ('filosofičtější', 0) >>> h.suggest('filosofičtějš')[0] ('filosofičtější', 0) >>> h = Hunspell(['it_IT']) >>> h.suggest('principianti') [('principianti', 0), ('principiati', -1), ('principiante', -1), ('principiarti', -1), ('principiasti', -1)] >>> h = Hunspell(['es_ES']) >>> h.suggest('teneis') [('tenéis', 0), ('tenes', -1), ('tenis', -1), ('teneos', -1), ('tienes', -1), ('te neis', -1), ('te-neis', -1)] >>> h.suggest('tenéis')[0] ('tenéis', 0) ''' if input_phrase in self._suggest_cache: return self._suggest_cache[input_phrase] if DEBUG_LEVEL > 1: sys.stderr.write( "Hunspell.suggest() input_phrase=%(ip)s\n" %{'ip': input_phrase.encode('UTF-8')}) # http://pwet.fr/man/linux/fichiers_speciaux/hunspell says: # # > A dictionary file (*.dic) contains a list of words, one per # > line. The first line of the dictionaries (except personal # > dictionaries) contains the word count. Each word may # > optionally be followed by a slash ("/") and one or more # > flags, which represents affixes or special attributes. # # I.e. if '/' is already contained in the input, it cannot # match a word in the dictionary and we return an empty list # immediately: if '/' in input_phrase: self._suggest_cache[input_phrase] = [] return [] # make sure input_phrase is in the internal normalization form (NFD): input_phrase = unicodedata.normalize( itb_util.NORMALIZATION_FORM_INTERNAL, input_phrase) input_phrase_no_accents = unicodedata.normalize( itb_util.NORMALIZATION_FORM_INTERNAL, itb_util.remove_accents(input_phrase)) # But enchant and pyhunspell want NFC as input, make a copy in NFC: input_phrase_nfc = unicodedata.normalize('NFC', input_phrase) suggested_words = {} for dictionary in self._dictionaries: if dictionary.words: # If the input phrase is longer than than the maximum # word length in a dictionary, don’t try # complete it, it just wastes time then. if len(input_phrase) <= dictionary.max_word_len: if dictionary.word_pairs: suggested_words.update([ (x[0], 0) for x in dictionary.word_pairs if x[1].startswith(input_phrase_no_accents)]) else: suggested_words.update([ (x, 0) for x in dictionary.words if x.startswith(input_phrase)]) if dictionary.enchant_dict: if len(input_phrase) >= 4: # Always pass NFC to enchant and convert the # result back to the internal normalization # form (NFD) (enchant does the right thing for # Korean if the input is NFC). enchant takes # unicode strings and returns unicode strings, # no encoding and decoding to and from the # hunspell dictionary encoding is necessary # (neither for Python2 nor Python3). # (pyhunspell needs to get its input passed # in dictionary encoding and also returns it # in dictionary encoding). if dictionary.enchant_dict.check(input_phrase_nfc): # This is a valid word in this dictionary. # It might have been missed by the matching # above because the dictionary might not # contain all possible word forms (The # prefix and suffix information has been # ignored). But hunspell knows about this, # if hunspell thinks it is a correct word, # it must be counted as a match of course: suggested_words[input_phrase] = 0 extra_suggestions = [ unicodedata.normalize( itb_util.NORMALIZATION_FORM_INTERNAL, x) for x in dictionary.enchant_dict.suggest(input_phrase_nfc) ] suggested_words.update([ (suggestion, -1) for suggestion in extra_suggestions if suggestion not in suggested_words]) elif dictionary.pyhunspell_object: if len(input_phrase) >= 4: # Always pass NFC to pyhunspell and convert # the result back to the internal # normalization form (NFD) (hunspell does the # right thing for Korean if the input is NFC). if dictionary.pyhunspell_object.spell( input_phrase_nfc.encode( dictionary.encoding, 'replace')): # This is a valid word in this dictionary. # It might have been missed by the matching # above because the dictionary might not # contain all possible word forms (The # prefix and suffix information has been # ignored). But hunspell knows about this, # if hunspell thinks it is a correct word, # it must be counted as a match of course: suggested_words[input_phrase] = 0 extra_suggestions = [ unicodedata.normalize( itb_util.NORMALIZATION_FORM_INTERNAL, x) for x in dictionary.pyhunspell_object.suggest( input_phrase_nfc.encode( dictionary.encoding, 'replace')) ] suggested_words.update([ (suggestion, -1) for suggestion in extra_suggestions if suggestion not in suggested_words]) else: if (dictionary.name[:2] not in ('ja', 'ja_JP', 'zh', 'zh_CN', 'zh_TW', 'zh_MO', 'zh_SG')): # For some languages, hunspell dictionaries don’t # exist because hunspell makes no sense for these # languages. In these cases, just ignore that the # hunspell dictionary is missing. With the # appropriate input method added, emoji can be # matched nevertheless. suggested_words.update([ ('☹ %(name)s dictionary not found. ' %{'name': dictionary.name} + 'Please install hunspell dictionary!', 0)]) for word in suggested_words: if (suggested_words[word] == -1 and itb_util.remove_accents(word) == itb_util.remove_accents(input_phrase)): # This spell checking correction is actually even # an accent insensitive match, adjust accordingly: suggested_words[word] = 0 sorted_suggestions = sorted( suggested_words.items(), key = lambda x: ( - x[1], # 0: in dictionary, -1: hunspell len(x[0]), # length of word ascending x[0], # alphabetical ))[0:MAX_WORDS] self._suggest_cache[input_phrase] = sorted_suggestions return sorted_suggestions BENCHMARK = True def main(): ''' Used for testing and profiling. “python3 hunspell_suggest.py” runs some tests and prints profiling data. ''' if BENCHMARK: import cProfile, pstats profile = cProfile.Profile() profile.enable() import doctest (failed, attempted) = doctest.testmod() if BENCHMARK: profile.disable() stats = pstats.Stats(profile) stats.strip_dirs() stats.sort_stats('cumulative') stats.print_stats('hunspell', 25) stats.print_stats('enchant', 25) if failed: sys.exit(1) else: sys.exit(0) if __name__ == "__main__": main()