# -*- coding: utf-8 -*-
# vim:et sts=4 sw=4
#
# ibus-typing-booster - A completion input method for IBus
#
# Copyright (c) 2015-2016 Mike FABIAN <mfabian@redhat.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>

'''A module to do transliteration using m17n-lib.

The shared library is opened and initialised as a side effect of
importing this module (see ``_init()``).
'''

import sys
import ctypes


# ctypes stand-ins for the libm17n types.  Apart from MSymbolStruct they
# carry no fields and are only ever handled through pointers; this module
# never reads the MSymbolStruct fields either.
class libm17n__MSymbolStruct(ctypes.Structure):
    pass


libm17n__MSymbol = ctypes.POINTER(libm17n__MSymbolStruct)


class libm17n__MPlist(ctypes.Structure):
    pass


class libm17n__MConverter(ctypes.Structure):
    pass


class libm17n__MInputMethod(ctypes.Structure):
    pass


class libm17n__MInputContext(ctypes.Structure):
    pass


class libm17n__MText(ctypes.Structure):
    pass


# The fields are assigned after the class statement because 'next' refers
# back to the structure itself.
libm17n__MSymbolStruct._fields_ = [
    ('managing_key', ctypes.c_uint),
    ('name', ctypes.c_char_p),
    ('length', ctypes.c_int),
    ('plist', libm17n__MPlist),
    ('next', ctypes.POINTER(libm17n__MSymbolStruct))]

# Handles to the library, its functions and variables; filled by _init().
libm17n__lib = None
libm17n__msymbol = None
libm17n__mplist = None
libm17n__mconv_buffer_converter = None
libm17n__mconv_reset_converter = None
libm17n__mconv_rebind_buffer = None
libm17n__mconv_encode = None
libm17n__minput_open_im = None
libm17n__minput_create_ic = None
libm17n__minput_reset_ic = None
libm17n__minput_filter = None
libm17n__minput_lookup = None
libm17n__mtext = None
libm17n__mtext_len = None
libm17n__m17n_object_unref = None
libm17n__Mcoding_utf_8 = None
_utf8_converter = None


def mtext_to_string(mtext_pointer):
    '''Return the text contained in an MText object as a Python string

    :param mtext_pointer: pointer to the MText object to get the text from
    :type mtext_pointer: pointer to an libm17n MText object
    :rtype: string
    '''
    libm17n__mconv_reset_converter(_utf8_converter)
    # one Unicode character cannot have more than 6 UTF-8 bytes
    # (actually not more than 4 ...)
    bufsize = (libm17n__mtext_len(mtext_pointer) + 1) * 6
    # The converter writes into this buffer from C, so it has to be
    # mutable memory; a Python ``bytes`` object must never be modified.
    conversion_buffer = ctypes.create_string_buffer(bufsize)
    libm17n__mconv_rebind_buffer(
        _utf8_converter, conversion_buffer, ctypes.c_int(bufsize))
    libm17n__mconv_encode(_utf8_converter, mtext_pointer)
    # Maybe not all of the buffer was really used for the conversion.
    # The buffer starts out zero filled and ``.value`` stops at the
    # first NUL byte, which cuts off the unused part.
    return conversion_buffer.value.decode('utf-8')


def _bind(name, argtypes, restype):
    '''Look up the function “name” in libm17n and declare its prototype

    :param name: name of the exported C function
    :type name: string
    :param argtypes: ctypes types of the arguments
    :type argtypes: list
    :param restype: ctypes type of the return value, None for void
    :rtype: the ctypes function object
    '''
    function = getattr(libm17n__lib, name)
    function.argtypes = argtypes
    function.restype = restype
    return function


def _init():
    '''Open libm17n and fill global variables for functions and
    variables from libm17n
    '''
    global libm17n__lib
    global libm17n__mplist
    global libm17n__mconv_buffer_converter
    global libm17n__mconv_reset_converter
    global libm17n__mconv_rebind_buffer
    global libm17n__mconv_encode
    global libm17n__msymbol
    global libm17n__minput_open_im
    global libm17n__minput_create_ic
    global libm17n__minput_reset_ic
    global libm17n__minput_filter
    global libm17n__minput_lookup
    global libm17n__mtext
    global libm17n__mtext_len
    global libm17n__m17n_object_unref
    global libm17n__Mcoding_utf_8
    global _utf8_converter

    converter_p = ctypes.POINTER(libm17n__MConverter)
    input_method_p = ctypes.POINTER(libm17n__MInputMethod)
    input_context_p = ctypes.POINTER(libm17n__MInputContext)
    mtext_p = ctypes.POINTER(libm17n__MText)

    libm17n__lib = ctypes.CDLL('libm17n.so.0', mode=ctypes.RTLD_GLOBAL)
    libm17n__lib.m17n_init()

    libm17n__mplist = _bind(
        'mplist', [], ctypes.POINTER(libm17n__MPlist))
    libm17n__mconv_buffer_converter = _bind(
        'mconv_buffer_converter',
        [libm17n__MSymbol, ctypes.c_char_p, ctypes.c_int],
        converter_p)
    libm17n__mconv_reset_converter = _bind(
        'mconv_reset_converter', [converter_p], ctypes.c_int)
    libm17n__mconv_rebind_buffer = _bind(
        'mconv_rebind_buffer',
        [converter_p, ctypes.c_char_p, ctypes.c_int],
        converter_p)
    libm17n__mconv_encode = _bind(
        'mconv_encode', [converter_p, mtext_p], ctypes.c_int)
    libm17n__msymbol = _bind(
        'msymbol', [ctypes.c_char_p], libm17n__MSymbol)
    libm17n__minput_open_im = _bind(
        'minput_open_im',
        [libm17n__MSymbol, libm17n__MSymbol, ctypes.c_void_p],
        input_method_p)
    libm17n__minput_create_ic = _bind(
        'minput_create_ic',
        [input_method_p, ctypes.c_void_p],
        input_context_p)
    # minput_reset_ic() returns void
    libm17n__minput_reset_ic = _bind(
        'minput_reset_ic', [input_context_p], None)
    libm17n__minput_filter = _bind(
        'minput_filter',
        [input_context_p, libm17n__MSymbol, ctypes.c_void_p],
        ctypes.c_int)
    libm17n__minput_lookup = _bind(
        'minput_lookup',
        [input_context_p, libm17n__MSymbol, ctypes.c_void_p, mtext_p],
        ctypes.c_int)
    libm17n__mtext = _bind('mtext', [], mtext_p)
    libm17n__mtext_len = _bind('mtext_len', [mtext_p], ctypes.c_int)
    # Needed to release the MText objects created with mtext()
    libm17n__m17n_object_unref = _bind(
        'm17n_object_unref', [ctypes.c_void_p], ctypes.c_int)

    # The symbol is looked up through the handle of the running process;
    # presumably this relies on libm17n having been loaded with
    # RTLD_GLOBAL above.
    libm17n__Mcoding_utf_8 = libm17n__MSymbol.in_dll(
        ctypes.pythonapi, 'Mcoding_utf_8')
    # The converter is created without a buffer, mtext_to_string()
    # binds a fresh buffer for every conversion.
    _utf8_converter = libm17n__mconv_buffer_converter(
        libm17n__Mcoding_utf_8, ctypes.c_char_p(None), ctypes.c_int(0))


def _del():
    '''Cleanup'''
    libm17n__lib.m17n_fini()


class __ModuleInitializer:
    '''Helper which runs _init() when it is instantiated at import time'''

    def __init__(self):
        _init()
        return

    def __del__(self):
        # Nothing is finalised here; _del() is not called from this file.
        return


__module_init = __ModuleInitializer()


class Transliterator:
    r'''A class for transliterators using libm17n

    Examples:

    Russian transliteration:

    >>> trans = Transliterator('ru-translit')
    >>> trans.transliterate(list('y'))
    'ы'
    >>> trans.transliterate(list('yo'))
    'ё'
    >>> trans.transliterate(list('yo y'))
    'ё ы'

    Marathi transliteration:

    >>> trans = Transliterator('mr-itrans')
    >>> trans.transliterate(list('praviN'))
    'प्रविण्'
    >>> trans.transliterate(list('namaste'))
    'नमस्ते'

    Hindi transliteration:

    >>> trans = Transliterator('hi-itrans')
    >>> trans.transliterate(list('namaste'))
    'नमस्ते'
    >>> trans.transliterate(list('. '))
    '। '

    Hindi-Inscript2 uses the AltGr key a lot, 'G-4' is the MSymbol name
    for AltGr-4 and it transliterates to something different than just '4':

    >>> trans = Transliterator('hi-inscript2')
    >>> trans.transliterate(['4', 'G-4'])
    '४₹'

    >>> trans = Transliterator('hi-inscript2')
    >>> trans.transliterate(['G-p'])
    'ज़'

    AltGr-3 ('G-3') is not used though in Hindi-Inscript2.
    Therefore, 'G-3' transliterates just as 'G-3':

    >>> trans = Transliterator('hi-inscript2')
    >>> trans.transliterate(['3', 'G-3'])
    '३G-3'

    In mr-inscript2, 'G-1' transliterates to U+200D ZERO WIDTH
    JOINER ('\xe2\x80\x8d' in UTF-8 encoding):

    >>> trans = Transliterator('mr-inscript2')
    >>> trans.transliterate(['j', 'd', 'G-1', '/']).encode('utf-8')
    b'\xe0\xa4\xb0\xe0\xa5\x8d\xe2\x80\x8d\xe0\xa4\xaf'

    >>> trans = Transliterator('t-latn-post')
    >>> trans.transliterate(list('gru"n'))
    'grün'

    >>> trans = Transliterator('NoIme')
    >>> trans.transliterate(['a', 'b', 'c', 'C-c', 'G-4'])
    'abcC-cG-4'

    >>> trans = Transliterator('ja-anthy')
    >>> trans.transliterate(['c', 'h', 'o', 'u', 'c', 'h', 'o', 'u'])
    'ちょうちょう'

    >>> trans = Transliterator('zh-py')
    >>> trans.transliterate(['n', 'i', 'h', 'a', 'o'])
    '你好'

    >>> trans = Transliterator('zh-tonepy')
    >>> trans.transliterate(['n', 'i', '3', 'h', 'a', 'o', '3'])
    '你好'

    >>> trans = Transliterator('ko-romaja')
    >>> trans.transliterate(list('annyeonghaseyo'))
    '안녕하세요'

    If initializing the transliterator fails, for example because a
    non-existing input method was given as the argument, a ValueError
    is raised:

    >>> trans = Transliterator('ru-translitx')
    Traceback (most recent call last):
      File "/usr/lib64/python3.4/doctest.py", line 1318, in __run
        compileflags, 1), test.globs)
      File "<doctest __main__.Transliterator[8]>", line 1, in <module>
        trans = Transliterator('ru-translitx')
      File "m17n_translit.py", line 194, in __init__
        raise ValueError('minput_open_im() failed')
    ValueError: minput_open_im() failed
    '''

    def __init__(self, ime):
        '''Initialize the input method to use for the transliteration

        Raises ValueError if something fails.

        :param ime: Full name of the m17n input method, for example
                    “hi-inscript2” or “t-latn-post”. There is one
                    special input method name “NoIme”. The input method
                    “NoIme” is just a dummy which does no transliteration
                    at all, it only joins the list of Msymbol names to
                    a string.
        :type ime: string
        '''
        self._dummy = ime == 'NoIme'
        if self._dummy:
            # Nothing to open, transliterate() only joins its input.
            return
        # “hi-inscript2” -> language “hi”, name “inscript2”; everything
        # after the first “-” belongs to the name.
        language, _separator, name = ime.partition('-')
        self._im = libm17n__minput_open_im(
            libm17n__msymbol(ctypes.c_char_p(language.encode('utf-8'))),
            libm17n__msymbol(ctypes.c_char_p(name.encode('utf-8'))),
            ctypes.c_void_p(None))
        # A NULL ctypes pointer is false in a boolean context.
        if not self._im:
            raise ValueError('minput_open_im() failed')
        self._ic = libm17n__minput_create_ic(self._im, ctypes.c_void_p(None))
        if not self._ic:
            raise ValueError('minput_create_ic() failed')

    def transliterate(self, msymbol_list):
        '''Transliterate a list of Msymbol names

        Returns the transliteration as a string.

        Raises ValueError if the argument is not a list.

        :param msymbol_list: A list of strings which are interpreted
                             as the names of Msymbols to transliterate.
                             If the input method has the special name
                             “NoIme”, no transliteration is done, the
                             list of Msymbols is just joined to a
                             single string.
        :type msymbol_list: A list of strings
        :rtype: string
        '''
        if not isinstance(msymbol_list, list):
            raise ValueError('Argument of transliterate() must be a list.')
        if self._dummy:
            return ''.join(msymbol_list)
        libm17n__minput_reset_ic(self._ic)
        pieces = []
        # 'nil' is fed in after the real input and is never copied to
        # the output itself.
        for symbol in msymbol_list + ['nil']:
            _symbol = libm17n__msymbol(symbol.encode('utf-8'))
            filtered = libm17n__minput_filter(
                self._ic, _symbol, ctypes.c_void_p(None))
            if filtered != 0:
                # No lookup is done for this symbol.
                continue
            _mt = libm17n__mtext()
            try:
                retval = libm17n__minput_lookup(
                    self._ic, _symbol, ctypes.c_void_p(None), _mt)
                if libm17n__mtext_len(_mt) > 0:
                    pieces.append(mtext_to_string(_mt))
            finally:
                # mtext() hands out a new object on every call, drop
                # our reference so that it does not leak.
                libm17n__m17n_object_unref(_mt)
            # On a non-zero lookup result the name of the symbol is
            # passed through unchanged.
            if retval and symbol != 'nil':
                pieces.append(symbol)
        return ''.join(pieces)


if __name__ == "__main__":
    import doctest
    (failed, attempted) = doctest.testmod()
    sys.exit(1 if failed else 0)