Gel4y Mini Shell

Edit file

File name : itb_emoji.py

Content :

# -*- coding: utf-8 -*-
# vim:et sts=4 sw=4
#
# ibus-typing-booster - A completion input method for IBus
#
# Copyright (c) 2015-2018 Mike FABIAN <mfabian@redhat.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>

'''A module used by ibus-typing-booster to match emoji and similar
Unicode characters.

'''

import os
import sys
import re
import gzip
import json
import unicodedata
import html
from distutils.version import LooseVersion
from difflib import SequenceMatcher
import gettext
import itb_util

# Name of the gettext text domain used to translate strings of this module.
DOMAINNAME = 'ibus-typing-booster'


def _(a):
    '''Returns the translation of “a” looked up in the DOMAINNAME domain'''
    return gettext.dgettext(DOMAINNAME, a)


def N_(a):
    '''Returns “a” unchanged

    Used in this file to wrap strings (for example the category names)
    without translating them at that point.
    '''
    return a

# enchant is an optional dependency: remember whether it could be
# imported, EmojiMatcher only creates enchant dictionaries if it was.
try:
    import enchant
except ImportError:
    IMPORT_ENCHANT_SUCCESSFUL = False
else:
    IMPORT_ENCHANT_SUCCESSFUL = True

# pykakasi is an optional dependency. If it can be imported, one shared
# converter instance is created and configured here, otherwise
# KAKASI_INSTANCE stays None.
KAKASI_INSTANCE = None
IMPORT_PYKAKASI_SUCCESSFUL = False
try:
    from pykakasi import kakasi
    IMPORT_PYKAKASI_SUCCESSFUL = True
    KAKASI_INSTANCE = kakasi()
    # The remarks after each call name the default which is overridden:
    KAKASI_INSTANCE.setMode('H', 'a')  # default: Hiragana no conversion
    KAKASI_INSTANCE.setMode('K', 'a')  # default: Katakana no conversion
    KAKASI_INSTANCE.setMode('J', 'a')  # default: Japanese no conversion
    KAKASI_INSTANCE.setMode('r', 'Hepburn')  # default: use Hepburn Roman table
    KAKASI_INSTANCE.setMode('C', True)  # add space default: no Separator
    KAKASI_INSTANCE.setMode('c', False)  # capitalize default: no Capitalize
except ImportError:
    IMPORT_PYKAKASI_SUCCESSFUL = False
    KAKASI_INSTANCE = None

# pinyin is an optional dependency: only remember whether the import
# worked, code using it has to check this flag first.
try:
    import pinyin
except ImportError:
    IMPORT_PINYIN_SUCCESSFUL = False
else:
    IMPORT_PINYIN_SUCCESSFUL = True

# The “data” directory next to the directory which contains this module:
DATADIR = os.path.join(os.path.dirname(__file__), '../data')

# USER_DATADIR will be “~/.local/share/ibus-typing-booster/data” by default
USER_DATADIR = itb_util.xdg_save_data_path('ibus-typing-booster/data')

# Directories which are searched, in this order, for CLDR annotation files:
CLDR_ANNOTATION_DIRNAMES = (
    USER_DATADIR,
    DATADIR,
    # On Fedora >= 25 there is a “cldr-emoji-annotation” package
    # which has the .xml files here in the subdirs “annotations”
    # and “annotationsDerived”:
    '/usr/share/unicode/cldr/common/',
    '/local/mfabian/src/cldr-svn/trunk/common/',
)

# Maps the abbreviation of each Unicode general category to a dict with
# the keys
#
#   'valid': whether _load_unicode_data() loads characters of this
#            category when it is not asked to load *all* characters
#   'major', 'minor': the two parts of the long name of the category
UNICODE_CATEGORIES = {
    abbreviation: {'valid': valid, 'major': major, 'minor': minor}
    for (abbreviation, valid, major, minor) in (
        ('Cc', False, 'Other', 'Control'),
        # 'Cf' contains RIGHT-TO-LEFT MARK ...
        ('Cf', True, 'Other', 'Format'),
        ('Cn', False, 'Other', 'Not assigned'),
        ('Co', False, 'Other', 'Private use'),
        ('Cs', False, 'Other', 'Surrogate'),
        ('Ll', False, 'Letter', 'Lowercase'),
        ('Lm', False, 'Letter', 'Modifier'),
        ('Lo', False, 'Letter', 'Other'),
        ('Lt', False, 'Letter', 'Titlecase'),
        ('Lu', False, 'Letter', 'Uppercase'),
        ('Mc', False, 'Mark', 'Spacing combining'),
        ('Me', False, 'Mark', 'Enclosing'),
        ('Mn', False, 'Mark', 'Nonspacing'),
        ('Nd', False, 'Number', 'Decimal digit'),
        ('Nl', False, 'Number', 'Letter'),
        # 'No' contains SUPERSCRIPT ONE ...
        ('No', True, 'Number', 'Other'),
        ('Pc', True, 'Punctuation', 'Connector'),
        ('Pd', True, 'Punctuation', 'Dash'),
        ('Pe', True, 'Punctuation', 'Close'),
        ('Pf', True, 'Punctuation', 'Final quote'),
        ('Pi', True, 'Punctuation', 'Initial quote'),
        ('Po', True, 'Punctuation', 'Other'),
        ('Ps', True, 'Punctuation', 'Open'),
        ('Sc', True, 'Symbol', 'Currency'),
        ('Sk', True, 'Symbol', 'Modifier'),
        ('Sm', True, 'Symbol', 'Math'),
        ('So', True, 'Symbol', 'Other'),
        ('Zl', True, 'Separator', 'Line'),
        ('Zp', True, 'Separator', 'Paragraph'),
        ('Zs', True, 'Separator', 'Space'),
    )
}

# Inclusive (first, last) pairs of Unicode codepoints which _in_range()
# accepts.
#
# VALID_RANGES are taken from ibus-uniemoji
# (but not used anymore at the moment)
#
# Some of the ranges overlap (for example the “APL symbols” lie inside
# “Miscellaneous Technical”), which does not matter for the membership
# test done by _in_range().
VALID_RANGES = (
    (0x0024, 0x0024), # DOLLAR SIGN
    (0x00a2, 0x00a5), # CENT SIGN, POUND SIGN, CURRENCY SIGN, YEN SIGN
    (0x058f, 0x058f), # ARMENIAN DRAM SIGN
    (0x060b, 0x060b), # AFGHANI SIGN
    (0x09f2, 0x09f3), # BENGALI RUPEE MARK, BENGALI RUPEE SIGN
    (0x09fb, 0x09fb), # BENGALI GANDA MARK
    (0x0af1, 0x0af1), # GUJARATI RUPEE SIGN
    (0x0bf9, 0x0bf9), # TAMIL RUPEE SIGN
    (0x0e3f, 0x0e3f), # THAI CURRENCY SYMBOL BAHT
    (0x17db, 0x17db), # KHMER CURRENCY SYMBOL RIEL
    (0x2000, 0x206f), # General Punctuation, Layout Controls,
                      # Invisible Operators
    (0x2070, 0x209f), # Superscripts and Subscripts
    (0x20a0, 0x20cf), # Currency Symbols
    (0x20d0, 0x20ff), # Combining Diacritical Marks for Symbols
    (0x2100, 0x214f), # Additional Squared Symbols, Letterlike Symbols
    (0x2150, 0x218f), # Number Forms
    (0x2190, 0x21ff), # Arrows
    (0x2200, 0x22ff), # Mathematical Operators
    (0x2300, 0x23ff), # Miscellaneous Technical, Floors and Ceilings
    (0x2336, 0x237a), # APL symbols
    (0x2400, 0x243f), # Control Pictures
    (0x2440, 0x245f), # Optical Character Recognition (OCR)
    (0x2460, 0x24ff), # Enclosed Alphanumerics
    (0x2500, 0x257f), # Box Drawing
    (0x2580, 0x259f), # Block Elements
    (0x25a0, 0x25ff), # Geometric Shapes
    (0x2600, 0x26ff), # Miscellaneous Symbols
    (0x2616, 0x2617), # Japanese Chess
    (0x2654, 0x265f), # Chess
    (0x2660, 0x2667), # Card suits
    (0x2630, 0x2637), # Yijing Trigrams
    (0x268a, 0x268f), # Yijing Monograms and Digrams
    (0x26c0, 0x26c3), # Checkers/Draughts
    (0x2700, 0x27bf), # Dingbats
    (0x27c0, 0x27ef), # Miscellaneous Mathematical Symbols-A
    (0x27f0, 0x27ff), # Supplemental Arrows-A
    (0x2800, 0x28ff), # Braille Patterns
    (0x2900, 0x297f), # Supplemental Arrows-B
    (0x2980, 0x29ff), # Miscellaneous Mathematical Symbols-B
    (0x2a00, 0x2aff), # Supplemental Mathematical Operators
    (0x2b00, 0x2bff), # Additional Shapes, Miscellaneous Symbols and Arrows
    (0xa838, 0xa838), # NORTH INDIC RUPEE MARK
    (0xfdfc, 0xfdfc), # RIAL SIGN
    (0xfe69, 0xfe69), # SMALL DOLLAR SIGN
    (0xff01, 0xff60), # Fullwidth symbols and currency signs
    (0x1f300, 0x1f5ff), # Miscellaneous Symbols and Pictographs
    (0x1f600, 0x1f64f), # Emoticons
    (0x1f650, 0x1f67f), # Ornamental Dingbats
    (0x1f680, 0x1f6ff), # Transport and Map Symbols
    (0x1f900, 0x1f9ff), # Supplemental Symbols and Pictographs
)

# Single characters which _load_unicode_data() always loads, even if
# their Unicode category is not marked as valid in UNICODE_CATEGORIES
# and not all characters are requested.
VALID_CHARACTERS = {
    'ﷺ', # ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM
    'ﷻ', # ARABIC LIGATURE JALLAJALALOUHOU
    '﷽', # ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
}

# The emoji skin tone modifier characters. When making a sequence fully
# qualified, EmojiMatcher.variation_selector_16_normalize() adds no
# U+FE0F after such a modifier and none after a character which is
# directly followed by such a modifier.
SKIN_TONE_MODIFIERS = ('🏻', '🏼', '🏽', '🏾', '🏿')

def is_invisible(text):
    '''Checks whether a text is invisible

    Returns True if the text is invisible, False if not.

    May return True for some texts which are not completely
    invisible but hard to see in most fonts.

    A text counts as invisible if each of its characters has one of
    the Unicode general categories “Cc”, “Cf”, “Zl”, “Zp”, or “Zs”.
    The empty text is therefore invisible as well.

    :param text: The text
    :type text: String
    :rtype: Boolean

    Examples:

    >>> is_invisible('a')
    False

    >>> is_invisible(' ')
    True

    >>> is_invisible(' a')
    False

    >>> is_invisible('  ')
    True

    >>> is_invisible('')
    True
    '''
    # all() stops at the first visible character instead of
    # scanning the rest of the text.
    return all(
        unicodedata.category(character) in ('Cc', 'Cf', 'Zl', 'Zp', 'Zs')
        for character in text)

def _in_range(codepoint):
    '''Checks whether the codepoint is in one of the valid ranges

    Returns True if the codepoint is in one of the valid ranges,
    else it returns False.

    The valid ranges are the (first, last) pairs in VALID_RANGES,
    both ends of a range are included.

    :param codepoint: The Unicode codepoint to check
    :type codepoint: Integer
    :rtype: Boolean

    Examples:

    >>> _in_range(0x1F915)
    True

    >>> _in_range(0x1F815)
    False

    >>> _in_range(ord('€'))
    True

    >>> _in_range(ord('₹'))
    True

    >>> _in_range(ord('₺'))
    True
    '''
    # A generator (not a list) lets any() stop at the first matching range.
    return any(first <= codepoint <= last for first, last in VALID_RANGES)

def _find_path_and_open_function(dirnames, basenames, subdir=''):
    '''Find the first existing file of a list of basenames and dirnames

For each file in “basenames”, tries whether that file or the
    file with “.gz” added can be found in the list of directories
    “dirnames” where “subdir” is added to each directory in the list.

Returns a tuple (path, open_function) where “path” is the
    complete path of the first file found and the open function
    is either “open()” or “gzip.open()”.

:param dirnames: A list of directories to search in
    :type dirnames: List of strings
    :param basenames: A list of file names to search for
    :type basenames: List of strings
    :rtype: A tuple (path, open_function)
    :param subdir: A subdirectory to be added to each directory in the list
    :type subdir: String

'''
    for basename in basenames:
        for dirname in dirnames:
            path = os.path.join(dirname, subdir, basename)
            if os.path.exists(path):
                if path.endswith('.gz'):
                    return (path, gzip.open)
                else:
                    return (path, open)
            path = os.path.join(dirname, basename + '.gz')
            if os.path.exists(path):
                return (path, gzip.open)
    return ('', None)

def find_cldr_annotation_path(language):
    '''
    Finds which CLDR annotation file would be used for the language given

    Returns the full path of the file found or an empty string if
    no file can be found for the language given.

    The language is expanded with itb_util.expand_languages() and for
    each resulting language, in that order, a file “<language>.xml” is
    searched in the “annotations” subdirectory of the directories in
    CLDR_ANNOTATION_DIRNAMES. The first file found wins.

    This function is intended to be used by the ibus-typing-booster
    setup tool to check whether CLDR annotations exist for a certain
    language.

    :param language: The language to search the annotation file for
    :type language: String
    :rtype: String
    '''
    for candidate in itb_util.expand_languages([language]):
        (path, dummy_open_function) = _find_path_and_open_function(
            CLDR_ANNOTATION_DIRNAMES,
            (candidate + '.xml',),
            subdir='annotations')
        if path:
            return path
    return ''

class EmojiMatcher():
    '''A class to find Emoji which best match a query string'''

def __init__(self, languages=('en_US',),
                 unicode_data=True, unicode_data_all=False,
                 emoji_unicode_min='1.0',
                 emoji_unicode_max='100.0',
                 cldr_data=True, quick=True,
                 non_fully_qualified=False,
                 romaji=True):
        '''
        Initialize the emoji matcher

        Stores the options, looks up the gettext translations and the
        enchant dictionaries for the languages, initializes the caches
        and finally loads the emoji data files.

        :param languages: A list of languages to use for matching emoji
        :type languages: List or tuple of strings
        :param unicode_data: Whether to load the UnicodeData.txt file as well
        :type unicode_data: Boolean
        :param unicode_data_all: Whether to load *all* of the Unicode characters
                                  from UnicodeData.txt. If False, most regular
                                  letters are omitted.
        :type unicode_data_all: Boolean
        :param emoji_unicode_min: Only stored in self._emoji_unicode_min by
                                  the constructor. Presumably the lowest
                                  Unicode version of emoji to offer
                                  (TODO confirm against the matching code).
        :type emoji_unicode_min: String
        :param emoji_unicode_max: Only stored in self._emoji_unicode_max by
                                  the constructor. Presumably the highest
                                  Unicode version of emoji to offer
                                  (TODO confirm against the matching code).
        :type emoji_unicode_max: String
        :param cldr_data: Whether to load data from CLDR as well
        :type cldr_data: Boolean
        :param quick: Whether to do a quicker but slighly less precise match.
                      Quick matching is about 4 times faster and usually
                      good enough.
        :type quick: Boolean
        :param non_fully_qualified: Only stored in self._non_fully_qualified
                                    by the constructor. Presumably selects
                                    whether emoji are produced without
                                    U+FE0F VARIATION SELECTOR-16
                                    (TODO confirm against the code using it).
        :type non_fully_qualified: Boolean
        :param romaji: Whether to add Latin transliteration for Japanese.
                       Works only when pykakasi is available, if this is not
                       the case, this option is ignored.
        :type romaji: Boolean
        '''
        self._languages = languages
        # One entry per expanded language: either a gettext translation
        # object or None if no exactly matching translation was found.
        self._gettext_translations = {}
        for language in itb_util.expand_languages(self._languages):
            mo_file = gettext.find(DOMAINNAME, languages=[language])
            if (mo_file
                    and
                    '/' + language  + '/LC_MESSAGES/' + DOMAINNAME + '.mo'
                    in mo_file):
                # Get the gettext translation instance only if a
                # translation file for this *exact* language was
                # found.  Ignore it if only a fallback was found. For
                # example, if “de_DE” was requested and only “de” was
                # found, ignore it.
                try:
                    self._gettext_translations[language] = gettext.translation(
                        DOMAINNAME, languages=[language])
                except (OSError, ):
                    self._gettext_translations[language] = None
            else:
                self._gettext_translations[language] = None
        self._unicode_data_all = unicode_data_all
        self._emoji_unicode_min = emoji_unicode_min
        self._emoji_unicode_max = emoji_unicode_max
        self._quick = quick
        self._non_fully_qualified = non_fully_qualified
        self._romaji = romaji
        # enchant dictionaries for those of the languages (as given,
        # not expanded) for which enchant has a dictionary:
        self._enchant_dicts = []
        if IMPORT_ENCHANT_SUCCESSFUL:
            for language in self._languages:
                if enchant.dict_exists(language):
                    self._enchant_dicts.append(enchant.Dict(language))
        # From the documentation
        # (https://docs.python.org/3.6/library/difflib.html):
        # “SequenceMatcher computes and caches detailed information
        # about the second sequence, so if you want to compare one
        # sequence against many sequences, use set_seq2() to set the
        # commonly used sequence once and call set_seq1() repeatedly,
        # once for each of the other sequences.”
        self._matcher = SequenceMatcher(
            isjunk=None, a='', b='', autojunk=False)
        # State used for matching, all of it starts out empty:
        self._match_cache = {}
        self._string1 = ''
        self._seq1 = ''
        self._len1 = 0
        self._string2 = ''
        self._string2_number_of_words = 0
        self._string2_word_list = []
        self._seq2 = ''
        self._len2 = 0
        # Filled by the _load_…() methods through _add_to_emoji_dict();
        # must exist before the first loader is called below.
        self._emoji_dict = {}
        self._candidate_cache = {}
        # The three data sources are loaded in this order on purpose.
        # The data from Unicode is loaded first to put the official
        # names first into the list of names to display the official
        # names in the candidates, if possible.  The second best names
        # are the long names of emojione.
        if unicode_data:
            self._load_unicode_data()
        self._load_unicode_emoji_data()
        self._load_unicode_emoji_sequences()
        self._load_unicode_emoji_zwj_sequences()
        self._load_unicode_emoji_test()
        self._load_emojione_data()
        if cldr_data:
            # Both kinds of CLDR annotations are loaded for each
            # expanded language:
            for language in itb_util.expand_languages(self._languages):
                self._load_cldr_annotation_data(language, 'annotations')
                self._load_cldr_annotation_data(language, 'annotationsDerived')

def get_languages(self):
        '''Returns a copy of the list of languages of this EmojiMatcher

        Useful to check whether an already available EmojiMatcher instance
        can be used or whether one needs a new instance because one needs
        a different list of languages.

        Note that the order of that list is important, a matcher which
        supports the same languages but in an different order might
        return different results.

        :rtype: A list of strings

        Examples:

        >>> m = EmojiMatcher(languages = ['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE', 'ja_JP'])
        >>> m.get_languages()
        ['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE', 'ja_JP']

        '''
        # A new list is filled instead of slicing self._languages,
        # because a slice of the default tuple ('en_US',) would be a
        # tuple again and a list is more convenient to compare with
        # another list.
        languages_copy = []
        languages_copy.extend(self._languages)
        return languages_copy

def variation_selector_16_normalize(
            self, emoji_string, non_fully_qualified=False):
        '''Removes or adds emoji presentation selectors (U+FE0F VARIATION
        SELECTOR-16)

        Returns the possibly changed sequence.

        If emoji_string is equal to '\ufe0f', it is returned unchanged.

        See:

        http://unicode.org/reports/tr51/#def_fully_qualified_emoji_zwj_sequence
        http://unicode.org/reports/tr51/#def_non_fully_qualified_emoji_zwj_sequence

        :param emoji_string: The emoji sequence to change.
        :type emoji_string: String
        :param non_fully_qualified: If True, remove all VS16 characters
                                    If False, make it a fully qualified
                                    sequence using VS16 characters a needed.
        :type non_fully_qualified: Boolean
        :rtype: String

        Examples:

        >>> matcher = EmojiMatcher()

        If non_fully_qualified=True, all variation selectors
        are removed from a sequence, no matter whether the sequence
        was correct or not:

        >>> matcher.variation_selector_16_normalize('⛹\ufe0f\u200d♀\ufe0f', non_fully_qualified=True)
        '⛹\u200d♀'

        >>> matcher.variation_selector_16_normalize('⛹🏿\u200d♀\ufe0f', non_fully_qualified=True)
        '⛹🏿\u200d♀'

        >>> matcher.variation_selector_16_normalize('#\ufe0f⃣', non_fully_qualified=True)
        '#⃣'

        >>> matcher.variation_selector_16_normalize('#⃣\ufe0f', non_fully_qualified=True)
        '#⃣'

        If non_fully_qualified=False, variation selectors are added to
        sequences as needed and incorrect sequences are repaired:

        >>> matcher.variation_selector_16_normalize('⛹🏿\ufe0f\u200d♀\ufe0f', non_fully_qualified=False)
        '⛹🏿\u200d♀\ufe0f'

        >>> matcher.variation_selector_16_normalize('⛹\ufe0f🏿\u200d♀\ufe0f', non_fully_qualified=False)
        '⛹🏿\u200d♀\ufe0f'

        >>> matcher.variation_selector_16_normalize('⛹\u200d\ufe0f♀', non_fully_qualified=False)
        '⛹\ufe0f\u200d♀\ufe0f'

        >>> matcher.variation_selector_16_normalize('#⃣\ufe0f', non_fully_qualified=False)
        '#\ufe0f⃣'

        >>> matcher.variation_selector_16_normalize('⛹\ufe0f♀', non_fully_qualified=False)
        '⛹\ufe0f♀\ufe0f'

        >>> matcher.variation_selector_16_normalize('⛹', non_fully_qualified=False)
        '⛹\ufe0f'
        '''
        # First strip all selectors, except if the input is nothing
        # but a single selector:
        if emoji_string != '\ufe0f':
            emoji_string = emoji_string.replace('\ufe0f', '')
        if non_fully_qualified:
            return emoji_string
        # Then add a selector after each character which needs one:
        last_index = len(emoji_string) - 1
        pieces = []
        for index, character in enumerate(emoji_string):
            pieces.append(character)
            if character in SKIN_TONE_MODIFIERS:
                continue
            if 'Emoji' not in self.properties(character):
                continue
            if 'Emoji_Presentation' in self.properties(character):
                continue
            if (index < last_index
                    and emoji_string[index + 1] in SKIN_TONE_MODIFIERS):
                # No selector between a character and its skin tone
                continue
            pieces.append('\ufe0f')
        return ''.join(pieces)

def _add_to_emoji_dict(self, emoji_dict_key, values_key, values):
        '''Adds data to the emoji_dict if not already there'''
        if not emoji_dict_key or not values_key or not values:
            return
        emoji_dict_key = (
            self.variation_selector_16_normalize(
                emoji_dict_key[0], non_fully_qualified=True),
            emoji_dict_key[1])
        if emoji_dict_key not in self._emoji_dict:
            self._emoji_dict[emoji_dict_key] = {}
        if type(values) is list:
            if (values_key not in self._emoji_dict[emoji_dict_key]):
                self._emoji_dict[emoji_dict_key][values_key] = []
            for value in values:
                if (value not in
                        self._emoji_dict[emoji_dict_key][values_key]):
                    self._emoji_dict[emoji_dict_key][values_key] += [value]
        else:
            self._emoji_dict[emoji_dict_key][values_key] = values

def _load_unicode_data(self):
        '''Loads emoji names from UnicodeData.txt

        For each character which is loaded, the lower-cased name is
        added to the 'names' and the category (abbreviation, major and
        minor name) to the 'ucategories' of the English entry of that
        character.

        If the file cannot be found, a message is written to stderr
        and nothing is loaded.
        '''
        search_dirs = (
            USER_DATADIR,
            DATADIR,
            # On Fedora, the “unicode-ucd” package has the
            # UnicodeData.txt file here:
            '/usr/share/unicode/ucd',
        )
        file_names = ('UnicodeData.txt',)
        (path, open_function) = _find_path_and_open_function(
            search_dirs, file_names)
        if not path:
            sys.stderr.write(
                '_load_unicode_data(): could not find "%s" in "%s"\n'
                % (file_names, search_dirs))
            return
        with open_function(path, mode='rt', encoding='utf-8') as unicode_data_file:
            for line in unicode_data_file:
                if not line.strip():
                    continue
                # Only the first three “;” separated fields are used:
                fields = line.split(';')[:3]
                codepoint_string, name, category = fields
                emoji_string = chr(int(codepoint_string, 16))
                if category in ('Cc', 'Co', 'Cs'):
                    # Never load control characters (“Cc”), they cause
                    # too much problems when trying to display
                    # them. Never load the “First” and “Last”
                    # characters of private use characters “Co” and
                    # surrogates (“Cs”) either as these are completely
                    # useless.
                    continue
                if (not self._unicode_data_all
                        and not UNICODE_CATEGORIES[category]['valid']
                        and emoji_string not in VALID_CHARACTERS):
                    continue
                emoji_dict_key = (emoji_string, 'en')
                self._add_to_emoji_dict(
                    emoji_dict_key, 'names', [name.lower()])
                category_names = UNICODE_CATEGORIES[category]
                self._add_to_emoji_dict(
                    emoji_dict_key,
                    'ucategories',
                    [category,
                     category_names['major'],
                     category_names['minor']])

def _load_unicode_emoji_data(self):
        '''
        Loads emoji property data from emoji-data.txt

        http://unicode.org/Public/emoji/5.0/emoji-data.txt

        The data lines are read as “codepoints ; property”, where
        “codepoints” is either a single hexadecimal codepoint or a
        range “first..last”. For each codepoint the property is added
        to the 'properties' of its English entry. If the comment of
        the line contains a version followed by a count in square
        brackets (like “# 1.1 [1]”), that version is stored as
        'uversion' as well.

        If the file cannot be found, a message is written to stderr
        and nothing is loaded.

        :raises ValueError: If the codepoints field of a line is
                            neither a single codepoint nor a range
                            with two ends
        '''
        dirnames = (USER_DATADIR, DATADIR)
        basenames = ('emoji-data.txt',)
        (path, open_function) = _find_path_and_open_function(
            dirnames, basenames)
        if not path:
            sys.stderr.write(
                '_load_unicode_emoji_data(): could not find "%s" in "%s"\n'
                %(basenames, dirnames))
            return
        # The patterns do not depend on the line, compile them only once:
        version_pattern = re.compile(
            r'[^;]*;[^;]*#\s*(?P<uversion>[0-9]+\.[0-9]+)\s*'
            + r'\[[0-9]+\]')
        comment_pattern = re.compile(r'#.*$')
        with open_function(path, mode='rt', encoding='utf-8') as unicode_emoji_data_file:
            for line in unicode_emoji_data_file:
                # The version has to be fetched before the comment,
                # which contains it, is removed from the line.
                match = version_pattern.match(line)
                unicode_version = match.group('uversion') if match else ''
                line = comment_pattern.sub('', line).strip()
                if not line:
                    continue
                codepoint_string, emoji_property = [
                    x.strip() for x in line.split(';')[:2]]
                codepoint_range = [
                    int(x, 16) for x in codepoint_string.split('..')]
                if len(codepoint_range) == 1:
                    first = last = codepoint_range[0]
                elif len(codepoint_range) == 2:
                    first, last = codepoint_range
                else:
                    # Raised explicitly (instead of using “assert”) to
                    # keep the check when running with “python -O”.
                    raise ValueError(
                        'invalid codepoint range: %r' % codepoint_string)
                for codepoint in range(first, last + 1):
                    emoji_string = chr(codepoint)
                    self._add_to_emoji_dict(
                        (emoji_string, 'en'), 'properties', [emoji_property])
                    if unicode_version:
                        self._add_to_emoji_dict(
                            (emoji_string, 'en'), 'uversion', unicode_version)

def _load_unicode_emoji_sequences(self):
        '''
        Loads emoji sequence data from emoji-sequences.txt

        http://unicode.org/Public/emoji/5.0/emoji-sequences.txt

        The data lines are read as “codepoints ; property ; name”,
        where “codepoints” is a space separated list of hexadecimal
        codepoints. For each sequence the property is added to the
        'properties' and the lower-cased name to the 'names' of its
        English entry. If the comment of the line contains a version
        followed by a count in square brackets (like “# 3.0 [1]”),
        that version is stored as 'uversion' as well.

        If the file cannot be found, a message is written to stderr
        and nothing is loaded.
        '''
        dirnames = (USER_DATADIR, DATADIR)
        basenames = ('emoji-sequences.txt',)
        (path, open_function) = _find_path_and_open_function(
            dirnames, basenames)
        if not path:
            sys.stderr.write(
                '_load_unicode_emoji_sequences(): could not find "%s" in "%s"\n'
                %(basenames, dirnames))
            return
        # The patterns do not depend on the line, compile them only once:
        version_pattern = re.compile(
            r'[^;]*;[^;]*;[^;]*#\s*(?P<uversion>[0-9]+\.[0-9]+)\s*'
            + r'\[[0-9]+\]')
        comment_pattern = re.compile(r'#.*$')
        with open_function(path, mode='rt', encoding='utf-8') as unicode_emoji_sequences_file:
            for line in unicode_emoji_sequences_file:
                # The version has to be fetched before the comment,
                # which contains it, is removed from the line.
                match = version_pattern.match(line)
                unicode_version = match.group('uversion') if match else ''
                line = comment_pattern.sub('', line).strip()
                if not line:
                    continue
                codepoints, emoji_property, name = [
                    x.strip() for x in line.split(';')[:3]]
                if codepoints == '0023 FE0F 20E3' and name == 'keycap:':
                    # Removing the comment above cuts the line at the
                    # first “#”, i.e. it also cuts the “#” off the name
                    # of this keycap. Put it back.
                    name = 'keycap: #'
                # NOTE(review): a codepoints field which is a range
                # (“XXXX..YYYY”) raises ValueError in int() here; confirm
                # that the data files in use never contain ranges.
                emoji_string = ''.join(
                    chr(int(codepoint, 16))
                    for codepoint in codepoints.split(' '))
                if emoji_string:
                    self._add_to_emoji_dict(
                        (emoji_string, 'en'), 'properties', [emoji_property])
                    self._add_to_emoji_dict(
                        (emoji_string, 'en'), 'names', [name.lower()])
                    if unicode_version:
                        self._add_to_emoji_dict(
                            (emoji_string, 'en'), 'uversion', unicode_version)

def _load_unicode_emoji_zwj_sequences(self):
        '''
        Loads emoji property data from emoji-zwj-sequences.txt

        http://unicode.org/Public/emoji/5.0/emoji-zwj-sequences.txt

        The data lines are read as “codepoints ; property ; name”,
        where “codepoints” is a space separated list of hexadecimal
        codepoints. For each sequence the property is added to the
        'properties' and the lower-cased name to the 'names' of its
        English entry. If the comment of the line contains a version
        followed by a count in square brackets (like “# 2.0 [1]”),
        that version is stored as 'uversion' as well.

        If the file cannot be found, a message is written to stderr
        and nothing is loaded.
        '''
        dirnames = (USER_DATADIR, DATADIR)
        basenames = ('emoji-zwj-sequences.txt',)
        (path, open_function) = _find_path_and_open_function(
            dirnames, basenames)
        if not path:
            sys.stderr.write(
                '_load_unicode_emoji_zwj_sequences(): could not find "%s" in "%s"\n'
                %(basenames, dirnames))
            return
        # The patterns do not depend on the line, compile them only once:
        version_pattern = re.compile(
            r'[^;]*;[^;]*;[^;]*#\s*(?P<uversion>[0-9]+\.[0-9]+)\s*'
            + r'\[[0-9]+\]')
        comment_pattern = re.compile(r'#.*$')
        with open_function(path, mode='rt', encoding='utf-8') as unicode_emoji_zwj_sequences_file:
            for line in unicode_emoji_zwj_sequences_file:
                # The version has to be fetched before the comment,
                # which contains it, is removed from the line.
                match = version_pattern.match(line)
                unicode_version = match.group('uversion') if match else ''
                line = comment_pattern.sub('', line).strip()
                if not line:
                    continue
                codepoints, emoji_property, name = [
                    x.strip() for x in line.split(';')[:3]]
                emoji_string = ''.join(
                    chr(int(codepoint, 16))
                    for codepoint in codepoints.split(' '))
                if emoji_string:
                    self._add_to_emoji_dict(
                        (emoji_string, 'en'), 'properties', [emoji_property])
                    self._add_to_emoji_dict(
                        (emoji_string, 'en'), 'names', [name.lower()])
                    if unicode_version:
                        self._add_to_emoji_dict(
                            (emoji_string, 'en'), 'uversion', unicode_version)

def _load_unicode_emoji_test(self):
        '''Loads emoji property data from emoji-test.txt

        http://unicode.org/Public/emoji/4.0/emoji-test.txt

        This is mostly for emoji sorting and for some categorization

        Don’t use the 5.0 file until Emoji 5.0 is released and some
        fonts support it, using the 5.0 file now results in lots of
        ugly replacement characters when browsing the categories.

        For the other files emoji-data.txt, emoji-zwj-sequences.txt,
        and emoji-sequences.txt it is OK to use the draft 5.0 versions
        as they don’t cause proposed characters to show up when
        browsing the categories. The proposed characters only show up
        when searching and in that case it is OK I think, it is not so
        obviously ugly. And I think it is useful for the proposed
        characters to be searchable.

        For each sequence which is not marked “non-fully-qualified”,
        its position in the file is stored as 'cldr_order', the
        categories derived from the current “# group:” and
        “# subgroup:” lines are added to 'categories' (and passed to
        _add_translated_categories_to_emoji_dict()), and the
        lower-cased name from the comment of the line, if there is
        one, is added to 'names'.

        If the file cannot be found, a message is written to stderr
        and nothing is loaded.

        '''
        dirnames = (USER_DATADIR, DATADIR)
        basenames = ('emoji-test.txt',)
        (path, open_function) = _find_path_and_open_function(
            dirnames, basenames)
        if not path:
            sys.stderr.write(
                '_load_unicode_emoji_test(): could not find "%s" in "%s"\n'
                %(basenames, dirnames))
            return
        cldr_group_to_emojione_category = {
            'Smileys & People': N_('people'),
            'Animals & Nature': N_('nature'),
            'Food & Drink': N_('food'),
            'Travel & Places': N_('travel'),
            'Activities': N_('activity'),
            'Objects': N_('objects'),
            'Symbols': N_('symbols'),
            'Flags': N_('flags'),
            'Modifiers': N_('modifier'), # not  in emoji-test.txt
            'Regional': N_('regional'), # not  in emoji-test.txt
        }
        cldr_subgroup_to_emojione_category = {
            'person-sport':  N_('activity'),
        }
        # The patterns do not depend on the line, compile them only once:
        group_pattern = re.compile(r'# group:(?P<group>.+)$')
        subgroup_pattern = re.compile(r'# subgroup:(?P<subgroup>.+)$')
        name_pattern = re.compile(r'[^#]+#\s+\S+\s+(?P<name>.+)$')
        comment_pattern = re.compile(r'#.*$')
        with open_function(path, mode='rt', encoding='utf-8') as unicode_emoji_test_file:
            group = ''
            subgroup = ''
            cldr_order = 0
            for line in unicode_emoji_test_file:
                match = group_pattern.match(line)
                if match:
                    group = match.group('group').strip()
                    continue
                match = subgroup_pattern.match(line)
                if match:
                    subgroup = match.group('subgroup').strip()
                    continue
                # The name has to be fetched before the comment, which
                # contains it, is removed from the line.
                match = name_pattern.match(line)
                name = match.group('name').strip() if match else ''
                line = comment_pattern.sub('', line).strip()
                if not line:
                    continue
                codepoints, qualification = [
                    x.strip() for x in line.split(';')[:2]]
                if qualification == 'non-fully-qualified':
                    # The non-fully-qualified sequences are
                    # all duplicates of the fully-qualified
                    # sequences.
                    continue
                cldr_order += 1
                emoji_string = ''.join(
                    chr(int(codepoint, 16))
                    for codepoint in codepoints.split(' '))
                if emoji_string:
                    # NOTE(review): a group name which is not in
                    # cldr_group_to_emojione_category raises KeyError
                    # here; confirm whether emoji-test.txt files with
                    # other group names have to be supported.
                    categories = [cldr_group_to_emojione_category[group]]
                    if subgroup in cldr_subgroup_to_emojione_category:
                        categories.append(
                            cldr_subgroup_to_emojione_category[subgroup])
                    self._add_to_emoji_dict(
                        (emoji_string, 'en'), 'cldr_order', str(cldr_order))
                    self._add_to_emoji_dict(
                        (emoji_string, 'en'), 'categories', categories)
                    self._add_translated_categories_to_emoji_dict(
                        emoji_string, categories)
                    if name:
                        self._add_to_emoji_dict(
                            (emoji_string, 'en'), 'names', [name.lower()])

def _load_emojione_data(self):
        '''
        Read the EmojiOne data file and store its contents in the
        emoji dictionary.

        For every entry of the JSON file the English names (display
        name, match name, short name and aliases), the category, the
        keywords and the sort order are stored, and the translated
        categories are added as well.  Both the layout of EmojiOne
        version 2 and the layout of version 3 are handled.  If no data
        file can be found, a message is written to stderr and nothing
        is loaded.
        '''
        search_dirs = (USER_DATADIR, DATADIR,
                       # On Fedora >= 25 the "nodejs-emojione-json"
                       # package installs the "emoji.json" file here:
                       '/usr/lib/node_modules/emojione/')
        file_names = ('emojione.json', 'emoji.json')
        (path, open_function) = _find_path_and_open_function(
            search_dirs, file_names)
        if not path:
            sys.stderr.write(
                '_load_emojione_data(): could not find "%s" in "%s"\n'
                %(file_names, search_dirs))
            return
        with open_function(path, mode='rt', encoding='utf-8') as json_file:
            emojione = json.load(json_file)
        # The data is treated as version 3 when it contains the key
        # '1f600', otherwise as version 2.
        is_version_3 = '1f600' in emojione
        if is_version_3:
            alias_key, ascii_key, order_key = (
                'shortname_alternates', 'ascii', 'order')
        else:
            alias_key, ascii_key, order_key = (
                'aliases', 'aliases_ascii', 'emoji_order')
        regional_indicators = '🇦🇧🇨🇩🇪🇫🇬🇭🇮🇯🇰🇱🇲🇳🇴🇵🇶🇷🇸🇹🇺🇻🇼🇽🇾🇿'
        for entry in emojione.values():
            if is_version_3:
                codepoints = entry['code_points']['output']
            else:
                codepoints = entry['unicode']
                # ZWJ emojis are in the 'unicode_alt' field:
                if ('unicode_alt' in entry
                        and '200d' in entry['unicode_alt']):
                    codepoints = entry['unicode_alt']
            emoji_string = ''.join(
                chr(int(codepoint, 16))
                for codepoint in codepoints.split('-'))

            # emojione has names like "kiss (woman,woman)", "couple
            # (man,man)", "cocos (keeling) islands", "ceuta, melilla".
            # The parentheses and commas disturb the matching because
            # the matching assumes that words are separated only by
            # spaces, and they match too much for ASCII smiley queries
            # like ":-)".  They are nicer for display though, so both
            # the original name (for display) and the name with these
            # characters removed (for matching) are kept.
            display_name = entry['name'].lower()
            match_name = re.sub(r' ?[(,)] ?', r' ', display_name).strip(' ')
            shortname = entry['shortname'].replace('_', ' ').strip(':')
            aliases = [
                alias.replace('_', ' ').strip(':')
                for alias in entry[alias_key]]
            ascii_aliases = entry[ascii_key]
            names = [display_name]
            for extra_name in [match_name, shortname] + aliases + ascii_aliases:
                if extra_name not in names:
                    names.append(extra_name)

            categories = [entry['category']]
            # EmojiOne has duplicate entries in the keywords and
            # EmojiOne 3 has some empty strings in the keyword lists;
            # drop both.  The keywords are sorted to get reproducible
            # output in the test cases: if the order changed, the
            # keyword which matches last could change, which does not
            # change the score but may change the additional
            # information shown in the display string.
            keywords = sorted(set(entry['keywords']) - {''})
            emoji_order = entry[order_key]

            if emoji_string == '🏳🌈':
                # The rainbow flag should be a zwj sequence.
                # This is a bug in emojione version 2:
                # https://github.com/Ranks/emojione/issues/455
                emoji_string = '🏳\u200d🌈'

            # The next three checks work around a bug in emojione
            # version 3.0 where the category is wrongly 'people':
            # https://github.com/Ranks/emojione/issues/476
            if len(emoji_string) == 1 and emoji_string in regional_indicators:
                categories = ['regional']
            if emoji_string in SKIN_TONE_MODIFIERS:
                categories = ['modifier']
            if (len(emoji_string) == 2
                    and emoji_string[0] in '#*0123456789'
                    and emoji_string[1] == '\ufe0f'):
                categories = []

            dict_key = (emoji_string, 'en')
            for field, value in (('names', names),
                                 ('categories', categories),
                                 ('keywords', keywords),
                                 ('emoji_order', emoji_order)):
                self._add_to_emoji_dict(dict_key, field, value)
            self._add_translated_categories_to_emoji_dict(
                emoji_string, categories)

def _add_translated_categories_to_emoji_dict(
            self, emoji_string, categories):
        '''
        Store the translations of the categories of an emoji in
        self._emoji_dict, once for each expanded language which has
        gettext translations.

        For Japanese, if pykakasi could be imported, the hiragana
        version of a translated category is stored as well when it
        differs from the translation, and so is the romaji version
        when self._romaji is set.

        :param emoji_string: An emoji
        :type emoji_string: String
        :param categories: The categories of the emoji
        :type categories: List of strings
        '''
        # This list is not used at runtime.  It only marks the category
        # names with N_() so that they show up as translatable strings.
        dummy_categories_to_translate = [
            # Translators: This is a name for a category of emoji
            N_('activity'),
            # Translators: This is a name for a category of emoji
            N_('flags'),
            # Translators: This is a name for a category of emoji
            N_('food'),
            # Translators: This is a name for a category of emoji
            N_('modifier'),
            # Translators: This is a name for a category of emoji
            N_('nature'),
            # Translators: This is a name for a category of emoji
            N_('objects'),
            # Translators: This is a name for a category of emoji
            N_('people'),
            # Translators: This is a name for a category of emoji
            N_('regional'),
            # Translators: This is a name for a category of emoji
            N_('symbols'),
            # Translators: This is a name for a category of emoji
            N_('travel'),
        ]

        def hiragana_converter():
            '''Switch KAKASI_INSTANCE to hiragana output, return a converter'''
            for source_script in ('H', 'K', 'J'):
                KAKASI_INSTANCE.setMode(source_script, 'H')
            return KAKASI_INSTANCE.getConverter()

        def romaji_converter():
            '''Switch KAKASI_INSTANCE to romaji output, return a converter'''
            for source_script in ('H', 'K', 'J'):
                KAKASI_INSTANCE.setMode(source_script, 'a')
            # default: use Hepburn Roman table
            KAKASI_INSTANCE.setMode('r', 'Hepburn')
            # add space default: no Separator
            KAKASI_INSTANCE.setMode('C', True)
            # capitalize default: no Capitalize
            KAKASI_INSTANCE.setMode('c', False)
            return KAKASI_INSTANCE.getConverter()

        if (IMPORT_PYKAKASI_SUCCESSFUL
                and 'ja' in itb_util.expand_languages(self._languages)):
            kakasi_converter = hiragana_converter()

        for language in itb_util.expand_languages(self._languages):
            translations = self._gettext_translations[language]
            if not translations:
                continue
            convert_japanese = (
                language == 'ja' and IMPORT_PYKAKASI_SUCCESSFUL)
            translated_categories = []
            for category in categories:
                translated = translations.gettext(category)
                translated_categories.append(translated)
                if not convert_japanese:
                    continue
                hiragana = kakasi_converter.do(translated)
                if hiragana != translated:
                    translated_categories.append(hiragana)
                if self._romaji:
                    romaji = romaji_converter().do(translated).lower()
                    # Go back to hiragana mode for the next category:
                    kakasi_converter = hiragana_converter()
                    if romaji != translated:
                        translated_categories.append(romaji)
            self._add_to_emoji_dict(
                (emoji_string, language),
                'categories', translated_categories)

def _load_cldr_annotation_data(self, language, subdir):
        '''
        Read translated emoji names and keywords from a CLDR
        annotation file and store them in the emoji dictionary.

        Annotations with type="tts" are stored as names, all other
        annotations are split at '|' and stored as keywords.  For
        Chinese a pinyin version and for Japanese a hiragana version
        (and a romaji version if self._romaji is set) of each label is
        stored in addition, if the modules needed for the conversion
        could be imported.

        :param language: The language to load the annotations for
        :param subdir: Passed on to _find_path_and_open_function()
        '''
        (path, open_function) = _find_path_and_open_function(
            CLDR_ANNOTATION_DIRNAMES, (language + '.xml',), subdir=subdir)
        if not path:
            return
        # Continue with the language of the file which was really
        # found (For example, it could be that 'es_ES' was requested,
        # but only the fallback 'es' was really found):
        file_name = os.path.basename(path)
        language = file_name.replace('.gz', '').replace('.xml', '')
        lowercase_content = language.startswith('en')
        use_pinyin = (
            language in ('zh', 'zh_Hant') and IMPORT_PINYIN_SUCCESSFUL)
        use_kakasi = (language == 'ja' and IMPORT_PYKAKASI_SUCCESSFUL)

        def hiragana_converter():
            '''Switch KAKASI_INSTANCE to hiragana output, return a converter'''
            for source_script in ('H', 'K', 'J'):
                KAKASI_INSTANCE.setMode(source_script, 'H')
            return KAKASI_INSTANCE.getConverter()

        def romaji_converter():
            '''Switch KAKASI_INSTANCE to romaji output, return a converter'''
            for source_script in ('H', 'K', 'J'):
                KAKASI_INSTANCE.setMode(source_script, 'a')
            # default: use Hepburn Roman table
            KAKASI_INSTANCE.setMode('r', 'Hepburn')
            # add space default: no Separator
            KAKASI_INSTANCE.setMode('C', True)
            # capitalize default: no Capitalize
            KAKASI_INSTANCE.setMode('c', False)
            return KAKASI_INSTANCE.getConverter()

        annotation_pattern = re.compile(
            r'.*<annotation cp="(?P<emojistring>[^"]+)"'
            r'\s*(?P<tts>type="tts"){0,1}'
            r'[^>]*>'
            r'(?P<content>.+)'
            r'</annotation>.*'
        )
        with open_function(
                path, mode='rt', encoding='utf-8') as annotation_file:
            for line in annotation_file.readlines():
                match = annotation_pattern.match(line)
                if not match:
                    continue
                dict_key = (match.group('emojistring'), language)
                content = html.unescape(match.group('content'))
                if lowercase_content:
                    content = content.lower()
                if match.group('tts'):
                    # The name is stored as it is, without stripping
                    field = 'names'
                    labels = [content]
                else:
                    field = 'keywords'
                    labels = [x.strip() for x in content.split('|')]
                if use_pinyin:
                    for label in labels:
                        self._add_to_emoji_dict(
                            dict_key, field, [label, pinyin.get(label)])
                elif use_kakasi:
                    # The mode is set again for each annotation
                    # because the romaji conversion below changes it.
                    kakasi_converter = hiragana_converter()
                    for label in labels:
                        self._add_to_emoji_dict(
                            dict_key, field,
                            [label, kakasi_converter.do(label)])
                    if self._romaji:
                        kakasi_converter = romaji_converter()
                        for label in labels:
                            self._add_to_emoji_dict(
                                dict_key, field,
                                [label, kakasi_converter.do(label).lower()])
                else:
                    self._add_to_emoji_dict(dict_key, field, labels)

def _set_seq1(self, string):
        '''Sequence 1 is a label from the emoji data

        The label is stored lower cased and without accents in
        self._string1.  Unless the quick mode is used, the label
        padded with one space on each side is also stored in
        self._seq1, its length in self._len1, and it is handed to
        self._matcher as the first sequence.
        '''
        label = itb_util.remove_accents(string).lower()
        self._string1 = label
        if self._quick:
            # The padded sequence is only needed by SequenceMatcher()
            return
        padded_label = ' ' + label + ' '
        self._seq1 = padded_label
        self._len1 = len(padded_label)
        self._matcher.set_seq1(padded_label)

def _set_seq2(self, string):
        '''Sequence 2 is the query string, i.e. the user input

        The query string is stored lower cased and without accents in
        self._string2, the number of its words in
        self._string2_number_of_words and the list of words to match,
        longest words first, in self._string2_word_list.  This list
        contains the words of the query string (duplicates are kept)
        and, if enchant could be imported, spell checking suggestions
        for the words longer than 3 characters which are not spelled
        correctly in any of the dictionaries in self._enchant_dicts.

        Unless the quick mode is used, the query string padded with
        one space on each side is also stored in self._seq2, its
        length in self._len2, it is handed to self._matcher as the
        second sequence, and self._match_cache is emptied.

        :param string: The query string
        '''
        string = itb_util.remove_accents(string).lower()
        self._string2 = string
        # Split the input string into a list of words:
        original_words = string.split(sep=None)
        self._string2_number_of_words = len(original_words)
        word_list = []
        for word in original_words:
            word_list.append(word)
            # If a word in the input string is not correctly spelled
            # in any of the enabled dictionaries, add spell checking
            # suggestions to the list (don’t do that if it is spelled
            # correctly in at least one dictionary):
            if len(word) <= 3 or not IMPORT_ENCHANT_SUCCESSFUL:
                continue
            spelled_correctly = any(
                dic.check(word) or dic.check(word.title())
                for dic in self._enchant_dicts)
            if spelled_correctly:
                continue
            # The word is incorrect in *all* dictionaries.  Collect
            # the suggestions of all dictionaries without duplicates,
            # keeping the order in which they were suggested.  A set
            # must not be used to produce the order here: the
            # iteration order of a set of strings differs between
            # runs, and the order of words of equal length influences
            # the scores calculated in self._match().
            seen_suggestions = set()
            for dic in self._enchant_dicts:
                for suggestion in dic.suggest(word):
                    # don’t use spellchecking suggestions shorter than
                    # 3 characters and lower case everything
                    if len(suggestion) <= 2:
                        continue
                    suggestion = suggestion.lower()
                    if suggestion not in seen_suggestions:
                        seen_suggestions.add(suggestion)
                        word_list.append(suggestion)
        # Keep duplicates coming from the query string.
        # Sort longest words first (the sort is stable, words of the
        # same length keep their order).
        self._string2_word_list = sorted(word_list, key=len, reverse=True)
        if not self._quick:
            # only needed when using SequenceMatcher()
            string = ' ' + string + ' '
            self._seq2 = string
            self._len2 = len(string)
            self._matcher.set_seq2(string)
            self._match_cache = {}

def _match(self, label, debug=False):
        '''Calculates a score for how well a label from the emoji data
        matches the query string.

        The query string must have been already set with
        self._set_seq2(query_string) before calling self._match().

        :param label: A label from the emoji data
        :param debug: If true, print to stdout how the score is
                      calculated
        :return: The score, a higher score means a better match
        '''
        self._set_seq1(label)
        cache_key = (self._string1, self._string2)
        if debug:
            print('string1 = “%s” string2 = “%s” string2_word_list = “%s”'
                  %(self._string1, self._string2, self._string2_word_list))
        if cache_key in self._match_cache:
            # Many keywords are shared by many emoji, therefore the
            # query string is often matched against a label which has
            # been matched before.  Reusing the earlier result speeds
            # the matching up quite a bit.
            cached_score = self._match_cache[cache_key]
            if debug:
                print('Cached, total_score = %s' %cached_score)
            return cached_score
        total_score = 0
        # Is the label equal to the complete query string?
        if self._string1 == self._string2:
            if debug:
                print('Exact match, total_score += 1000')
            total_score += 1000
        # Is the label equal to a word in the word list?  This counts
        # only once, even if the word is several times in the list.
        if self._string1 in self._string2_word_list:
            if self._string2_number_of_words == 1:
                total_score += 300
                if debug:
                    print('Spell check exact match, word = “%s”, '
                          %self._string1 + 'total_score += 300')
            else:
                total_score += 200
                if debug:
                    print('Exact match from word_list, word = “%s”, '
                          %self._string1 + 'total_score += 200')
        # Is a word from the word list found at the beginning of a
        # word in the label?  The matched part is cut out of the label
        # so that it cannot be matched again by a following word.
        remaining = self._string1
        for word in self._string2_word_list:
            found = re.search(r'\b' + re.escape(word), remaining)
            if not found:
                continue
            begin, end = found.span()
            match_value = 100 + end - begin
            if begin == 0:
                match_value += 20
            total_score += match_value
            remaining = remaining[:begin] + remaining[end:]
            if debug:
                print('Substring match from word_list, word = “%s”, '
                      %word
                      + 'total_score += %s' %match_value)
        # Is a word from the word list found anywhere in the label
        # when the spaces in the label are ignored?
        remaining = self._string1.replace(' ', '')
        for word in self._string2_word_list:
            begin = remaining.find(word)
            if begin < 0:
                continue
            end = begin + len(word)
            match_value = 20 + end - begin
            if begin == 0:
                match_value += 20
            total_score += match_value
            remaining = remaining[:begin] + remaining[end:]
            if debug:
                print('Space insensitive substring match from word_list, '
                      + 'word = “%s”, ' %word
                      + 'total_score += %s' %match_value)
        if self._quick:
            self._match_cache[cache_key] = total_score
            return total_score
        # The following code using SequenceMatcher() might increase
        # the total_score by up to 500 approximately. It improves
        # the matching a little bit but it is very slow.
        if debug:
            print('seq1 = “%s” seq2 = “%s”' %(self._seq1, self._seq2))
        # Factor for a common block touching a space at its edge,
        # indexed by how many of the two sequences have that edge at
        # their very beginning (or their very end):
        edge_factor = (1, 2, 4)
        for tag, i1, i2, j1, j2 in self._matcher.get_opcodes():
            score = 0
            match_length = i2 - i1
            # Only common blocks longer than one character count
            if tag == 'equal' and match_length > 1:
                score = match_length
                # favor word boundaries
                if self._seq1[i1] == ' ':
                    at_start = (i1 == 0) + (j1 == 0)
                    score += edge_factor[at_start] * match_length
                if i1 > 0 and j1 > 0 and self._seq1[i1 - 1] == ' ':
                    score += match_length
                if self._seq1[i2 - 1] == ' ':
                    at_end = (i2 == self._len1) + (j2 == self._len2)
                    score += edge_factor[at_end] * match_length
            total_score += score
            if debug:
                print(
                    '{:7} a[{:2}:{:2}] --> b[{:2}:{:2}]'.format(
                        tag, i1, i2, j1, j2)
                    + '{:3} {:3} {!r} --> {!r}'.format(
                        score, total_score,
                        self._seq1[i1:i2], self._seq2[j1:j2]))
        self._match_cache[cache_key] = total_score
        return total_score

def candidates(self, query_string, match_limit=20, debug=tuple()):
        '''
        Find a list of emoji which best match a query string.

:param query_string: A search string
        :type query_string: string
        :param match_limit: Limit the number of matches to this amount
        :type match_limit: integer
        :param debug: List or tuple of emojis to print debug information
                      about the matching to stdout.
        :type debug: List of strings
        :rtype: A list of tuples of the form (<emoji>, <name>, <score),
                i.e. a list like this:
                [('🎂', 'birthday cake', 3106), ...]

Examples:

>>> mq = EmojiMatcher(languages = ['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE', 'ja_JP'])

>>> mq.candidates('😺', match_limit=3)
        [('😺', 'smiling cat face with open mouth [😺, So, people, cat, face, mouth, open, smile, grinning cat face]', 9), ('😆', 'smiling face with open mouth and tightly-closed eyes [So, people, face, mouth, open, smile]', 6), ('😄', 'smiling face with open mouth and smiling eyes [So, people, face, mouth, open, smile]', 6)]

>>> mq.candidates('ネコ＿')[0][:2]
        ('🐈', 'ネコ')

>>> mq.candidates('ant')[0][:2]
        ('🐜', 'ant')

>>> mq.candidates('ameise')[0][:2]
        ('🐜', 'Ameise')

>>> mq.candidates('Ameise')[0][:2]
        ('🐜', 'Ameise')

>>> mq.candidates('formica')[0][:2]
        ('🐜', 'formica')

>>> mq.candidates('hormiga')[0][:2]
        ('🐜', 'hormiga')

>>> mq.candidates('cacca')[0][:2]
        ('💩', 'cacca')

>>> mq.candidates('orso')[0][:2]
        ('🐻', 'faccina orso')

>>> mq.candidates('lupo')[0][:2]
        ('🐺', 'faccina lupo')

>>> mq.candidates('gatto')[0][:2]
        ('🐈', 'gatto')

>>> mq.candidates('gatto sorride')[0][:2]
        ('😺', 'gatto che sorride')

Any white space and '_' can be used to separate keywords in the
        query string:

>>> mq.candidates('gatto_	 sorride')[0][:2]
        ('😺', 'gatto che sorride')

>>> mq.candidates('nerd glasses')[0][:2]
        ('🤓', 'nerd face')

>>> mq.candidates('smiling face sun glasses')[0][:2]
        ('😎', 'smiling face with sunglasses')

>>> mq.candidates('halo')[0][:2]
        ('😇', 'smiling face with halo')

>>> mq.candidates('factory')[0][:2]
        ('🏭', 'factory')

>>> mq.candidates('man tone5')[0][:2]
        ('👨🏿', 'man: dark skin tone “man tone5”')

>>> mq.candidates('tone')[0][:2]
        ('🕵🏻', 'detective: light skin tone “sleuth or spy tone1”')

>>> mq.candidates('tone1')[0][:2]
        ('🏻', 'emoji modifier fitzpatrick type-1-2 “tone1”')

>>> mq.candidates('tone5')[0][:2]
        ('🏿', 'emoji modifier fitzpatrick type-6 “tone5”')

>>> mq.candidates('a')[0][:2]
        ('🅰\ufe0f', 'negative squared latin capital letter a')

>>> mq.candidates('squared a')[0][:2]
        ('🅰\ufe0f', 'negative squared latin capital letter a')

>>> mq.candidates('squared capital a')[0][:2]
        ('🅰\ufe0f', 'negative squared latin capital letter a')

>>> mq.candidates('c')[0][:2]
        ('©\ufe0f', 'Copyright')

>>> mq.candidates('us')[0][:2]
        ('🇺🇸', 'united states “us”')

>>> mq.candidates('flag us')[0][:2]
        ('🇺🇸', 'united states “flag: united states”')

>>> mq.candidates('united states')[0][:2]
        ('🇺🇸', 'united states “flag: united states”')

>>> mq.candidates('united')[0][:2]
        ('🇦🇪', 'united arab emirates “flag: united arab emirates”')

>>> mq.candidates('united minor outlying islands')[0][:2]
        ('🇺🇲', 'u.s. outlying islands “flag: u.s. outlying islands”')

>>> mq.candidates('united arab')[0][:2]
        ('🇦🇪', 'united arab emirates “flag: united arab emirates”')

>>> mq.candidates('mm')[0][:2]
        ('🇲🇲', 'myanmar (burma) “mm”')

>>> mq.candidates('flag mm')[0][:2]
        ('🇲🇲', 'myanmar (burma) “flag: myanmar (burma)”')

>>> mq.candidates('myanmar')[0][:2]
        ('🇲🇲', 'myanmar (burma) “flag: myanmar (burma)”')

>>> mq.candidates('sj')[0][:2]
        ('🇸🇯', 'svalbard & jan mayen “sj”')

>>> mq.candidates('flag sj')[0][:2]
        ('🇸🇯', 'svalbard & jan mayen “flag: svalbard & jan mayen”')

>>> mq.candidates('svalbard')[0][:2]
        ('🇸🇯', 'svalbard & jan mayen “flag: svalbard & jan mayen”')

>>> mq.candidates('jan mayen')[0][:2]
        ('🇸🇯', 'svalbard & jan mayen “flag: svalbard & jan mayen”')

>>> mq.candidates('mayen')[0][:2]
        ('🇸🇯', 'svalbard & jan mayen “flag: svalbard & jan mayen”')

>>> mq.candidates(':-)')[0][:2]
        ('🙂', 'slightly smiling face “:-)”')

>>> mq.candidates('family')[0][:2]
        ('👪', 'family')

>>> mq.candidates('man')[0][:2]
        ('👨', 'man')

>>> mq.candidates('woman')[0][:2]
        ('👩', 'woman')

>>> mq.candidates('girl')[0][:2]
        ('👧', 'girl')

>>> mq.candidates('boy')[0][:2]
        ('👦', 'boy')

>>> mq.candidates('family man')[0][:2]
        ('👨\u200d👩\u200d👦', 'family: man, woman, boy “family man woman boy”')

>>> mq.candidates('man man girl boy')[0][:2]
        ('👨\u200d👧\u200d👦', 'family: man, girl, boy “family man girl boy”')

>>> mq.candidates('mmgb')[0][:2]
        ('👨\u200d👨\u200d👧\u200d👦', 'family: man, man, girl, boy “family mmgb”')

>>> mq.candidates('manmangirlboy')[0][:2]
        ('👨\u200d👨\u200d👧\u200d👦', 'family: man, man, girl, boy')

>>> mq.candidates('bird')[0][:2]
        ('🐦', 'bird')

>>> mq.candidates('bir')[0][:2]
        ('🎂', 'birthday cake')

>>> mq.candidates('birth')[0][:2]
        ('🎂', 'birthday cake')

>>> mq.candidates('camera')[0][:2]
        ('📷', 'camera')

>>> mq.candidates('symbol')[0][:2]
        ('🔣', 'input symbol for symbols {Symbol}')

>>> mq.candidates('atomsymbol')[0][:2]
        ('⚛\ufe0f', 'atom symbol')

>>> mq.candidates('peacesymbol')[0][:2]
        ('☮\ufe0f', 'peace symbol')

>>> mq.candidates('peace symbol')[0][:2]
        ('☮\ufe0f', 'peace symbol {Symbol}')

>>> mq.candidates('animal')[0][:2]
        ('🐵', 'cara de mono [animal]')

>>> mq.candidates('dromedary animal')[0][:2]
        ('🐪', 'dromedary camel')

>>> mq.candidates('camel')[0][:2]
        ('🐫', 'bactrian camel')

>>> mq.candidates('people')[0][:2]
        ('👯', 'woman with bunny ears “people with bunny ears partying”')

>>> mq.candidates('nature')[0][:2]
        ('🙈', 'see-no-evil monkey {nature}')

>>> mq.candidates('travel')[0][:2]
        ('\U0001f9f3', 'luggage {travel}')

>>> mq.candidates('ferry')[0][:2]
        ('⛴\ufe0f', 'ferry')

>>> mq.candidates('ferry travel')[0][:2]
        ('⛴\ufe0f', 'ferry {travel}')

>>> mq.candidates('ferry travel boat')[0][:2]
        ('⛴\ufe0f', 'ferry {travel}')

>>> mq.candidates('boat')[0][:2]
        ('🚣🏻\u200d♂️', 'man rowing boat: light skin tone “man rowing boat light skin tone”')

>>> mq.candidates('anchor')[0][:2]
        ('⚓', 'anchor')

>>> mq.candidates('anchor boat')[0][:2]
        ('🚣🏻\u200d♂️', 'man rowing boat: light skin tone “man rowing boat light skin tone”')

>>> mq.candidates('buterfly')[0][:2]
        ('\U0001f98b', 'butterfly')

>>> mq.candidates('badminton')[0][:2]
        ('🏸', 'badminton racquet and shuttlecock')

>>> mq.candidates('badmynton')[0][:2]
        ('🏸', 'badminton racquet and shuttlecock')

>>> mq.candidates('padminton')[0][:2]
        ('🏸', 'badminton racquet and shuttlecock')

>>> mq.candidates('fery')[0][:2]
        ('⛴\ufe0f', 'ferry')

>>> mq.candidates('euro sign')[0][:2]
        ('€', 'euro sign')

>>> mq.candidates('superscript one')[0][:2]
        ('¹', 'superscript one')

>>> mq.candidates('currency')[0][:2]
        ('💱', 'currency exchange')

>>> mq.candidates('connector')[0][:2]
        ('﹎', 'centreline low line {Connector}')

>>> mq.candidates('dash')[0][:2]
        ('💨', 'dash symbol')

>>> mq.candidates('close')[0][:2]
        ('⸥', 'bottom right half bracket {Close}')

>>> mq.candidates('punctuation')[0][:2]
        ('‼\ufe0f', 'double exclamation mark {Punctuation} [punctuation]')

>>> mq.candidates('final quote')[0][:2]
        ('⸅', 'right dotted substitution bracket {Final quote}')

>>> mq.candidates('initial quote')[0][:2]
        ('‟', 'double high-reversed-9 quotation mark {Initial quote}')

>>> mq.candidates('modifier')[0][:2]
        ('🏻', 'emoji modifier fitzpatrick type-1-2 {Modifier}')

>>> mq.candidates('math')[0][:2]
        ('𝜵', 'mathematical bold italic nabla {Math}')

>>> mq.candidates('separator line')[0][:2]
        (' ', 'U+2028 line separator {Line}')

>>> mq.candidates('separator paragraph')[0][:2]
        (' ', 'U+2029 paragraph separator {Paragraph}')

>>> mq.candidates('separator space')[0][:2]
        (' ', 'U+20 space {Space}')

>>> mq = EmojiMatcher(languages = ['fr_FR'])
        >>> mq.candidates('chat')[0][:2]
        ('🐈', 'chat')

>>> mq.candidates('réflexion')[0][:2]
        ('🤔', 'visage en pleine réflexion')

>>> mq.candidates('🤔', match_limit = 3)
        [('🤔', 'visage en pleine réflexion [🤔, réflexion, visage, visage en pleine réflexion]', 4), ('🤩', 'visage avec des étoiles à la place des yeux [visage]', 1), ('🤗', 'visage qui fait un câlin [visage]', 1)]

>>> mq = EmojiMatcher(languages = ['fr_FR'])
        >>> mq.candidates('2019')
        [('’', 'U+2019 right single quotation mark', 200)]

>>> mq.candidates('41')
        [('A', 'U+41 latin capital letter a', 200)]

>>> mq.candidates('2a')
        [('*', 'U+2A asterisk', 200)]

>>> mq.candidates('1b')
        [('\\x1b', 'U+1B', 200)]

>>> mq.candidates('')
        []
        '''
        if not query_string:
            return []
        # self._emoji_dict contains only non-fully-qualified sequences:
        query_string = self.variation_selector_16_normalize(
            query_string, non_fully_qualified=True)
        # Replace any sequence of white space characters and '_'
        # and '＿' in the query string with a single ' '.  '＿'
        # (U+FF3F FULLWIDTH LOW LINE) is included here because when
        # Japanese transliteration is used, something like “neko_”
        # transliterates to “ねこ＿” and that should of course match
        # the emoji for “ねこ”　(= “cat”):
        query_string = re.sub(r'[＿_\s]+', ' ', query_string)
        if ((query_string, match_limit) in self._candidate_cache
                and not debug):
            return self._candidate_cache[(query_string, match_limit)]
        if (query_string, 'en') in self._emoji_dict:
            # the query_string is itself an emoji, match similar ones:
            candidates = self.similar(query_string, match_limit=match_limit)
            self._candidate_cache[(query_string, match_limit)] = candidates
            return candidates
        self._set_seq2(query_string)
        candidates = []
        for emoji_key, emoji_value in self._emoji_dict.items():
            if emoji_key[0] in debug:
                debug_match = True
                print('===================================')
                print('Debug match for “%s”' %emoji_key[0])
                print('===================================')
            else:
                debug_match = False

total_score = 0
            good_match_score = 200
            name_good_match = ''
            ucategory_good_match = ''
            category_good_match = ''
            keyword_good_match = ''
            if 'names' in emoji_value:
                for name in emoji_value['names']:
                    score = 2 * self._match(name, debug=debug_match)
                    if score >= good_match_score:
                        name_good_match = name
                    total_score += score
            if 'ucategories' in emoji_value:
                for ucategory in emoji_value['ucategories']:
                    score = self._match(ucategory, debug=debug_match)
                    if score >= good_match_score:
                        ucategory_good_match = ucategory
                    total_score += score
            if 'categories' in emoji_value:
                for category in emoji_value['categories']:
                    score = self._match(category, debug=debug_match)
                    if score >= good_match_score:
                        category_good_match = category
                    total_score += score
            if 'keywords' in emoji_value:
                for keyword in emoji_value['keywords']:
                    score = self._match(keyword, debug=debug_match)
                    if score >= good_match_score:
                        keyword_good_match = keyword
                    total_score += score

if total_score > 0:
                if 'names' in emoji_value:
                    display_name = emoji_value['names'][0]
                else:
                    display_name = self.name(emoji_key[0])
                if (len(emoji_key[0]) == 1
                        and is_invisible(emoji_key[0])):
                    # Add the code point to the display name of
                    # “invisible” characters:
                    display_name = ('U+%X' %ord(emoji_key[0])
                                    + ' ' + display_name)
                # If the match was good because something else
                # but the main name had a good match, show it in
                # the display name to make the user understand why
                # this emoji matched:
                if name_good_match not in display_name:
                    display_name += ' “' + name_good_match + '”'
                if ucategory_good_match not in display_name:
                    display_name += ' {' + ucategory_good_match + '}'
                if category_good_match not in display_name:
                    display_name += ' {' + category_good_match + '}'
                if keyword_good_match not in display_name:
                    display_name += ' [' + keyword_good_match + ']'
                candidates.append((
                    self.variation_selector_16_normalize(
                        emoji_key[0],
                        self._non_fully_qualified),
                    display_name,
                    total_score))

try:
            codepoint = int(query_string, 16)
            if (codepoint >= 0x0 and codepoint <= 0x1FFFFF
                # exclude surrogates and private use characters:
                and not (codepoint >= 0xd800 and codepoint <= 0xf8ff)
                and not (codepoint >= 0xf0000 and codepoint <= 0xffffd)
                and not (codepoint >= 0x100000 and codepoint <= 0x10fffd)):
                char = chr(codepoint)
                name = self.name(char)
                if not name:
                    try:
                        name = unicodedata.name(char).lower()
                    except (ValueError,):
                        pass
                if name:
                    name = ' ' + name
                candidates.append(
                    (char,
                     'U+' + query_string.upper()
                     + name,
                     good_match_score))
        except (ValueError,):
            pass

sorted_candidates = sorted(
            candidates,
            key=lambda x: (
                - x[2],                # score
                self.cldr_order(x[0]), # CLDR order
                - len(x[0]),           # length of the emoji sequence
                x[1]                   # name of the emoji
            ))[:match_limit]

self._candidate_cache[(query_string, match_limit)] = sorted_candidates
        return sorted_candidates

def names(self, emoji_string, language=''):
        '''Look up all names of an emoji.

        If a language is requested, only that language is consulted
        and an empty list is returned when the emoji has no names in
        that language.

        If no language is requested, the languages of this
        EmojiMatcher (expanded with itb_util.expand_languages()) are
        tried in order and the list of names of the first language
        which has names for the emoji is returned.  If none of them
        has names, an empty list is returned.

        :param emoji_string: The string of Unicode characters which are
                             used to encode the emoji
        :type emoji_string: string
        :param language: The language requested for the name
        :type language: string
        :rtype: List of strings

        Examples:

        >>> matcher = EmojiMatcher(languages = ['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE', 'ja_JP'])
        >>> matcher.names('🙂')
        ['slightly smiling face', 'slight smile', ':)', ':-)', '=]', '=)', ':]']
        '''
        # The keys of self._emoji_dict use only non-fully-qualified
        # sequences, so normalize before looking anything up:
        lookup_string = self.variation_selector_16_normalize(
            emoji_string, non_fully_qualified=True)
        if language:
            # An explicitly requested language is the only one probed.
            search_languages = [language]
        else:
            search_languages = itb_util.expand_languages(self._languages)
        for search_language in search_languages:
            entry = self._emoji_dict.get((lookup_string, search_language), {})
            if 'names' in entry:
                return entry['names']
        return []

def name(self, emoji_string, language=''):
        '''Look up the main name of an emoji.

        The main name is the first entry of the list returned by
        self.names() for the same arguments.  If that list is empty,
        an empty string is returned.

        If no language is requested, the name is returned in the first
        language of this EmojiMatcher for which a name can be
        found.

        :param emoji_string: The string of Unicode characters which are
                             used to encode the emoji
        :type emoji_string: string
        :param language: The language requested for the name
        :type language: string
        :rtype: string

        Examples:

        >>> matcher = EmojiMatcher(languages = ['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE', 'ja_JP'])

        >>> matcher.name('🏸')
        'badminton racquet and shuttlecock'

        >>> matcher.name('🖥')
        'desktop computer'

        >>> matcher = EmojiMatcher(languages=['es_MX', 'es_ES', 'it_IT', 'ja_JP'])
        >>> matcher.name('🖥')
        'computadora de escritorio'

        >>> matcher = EmojiMatcher(languages=['es_ES', 'es_MX', 'it_IT', 'ja_JP'])
        >>> matcher.name('🖥')
        'ordenador de sobremesa'

        >>> matcher = EmojiMatcher(languages=['de_DE', 'es_ES', 'es_MX', 'it_IT', 'ja_JP'])
        >>> matcher.name('🖥')
        'Desktopcomputer'

        >>> matcher = EmojiMatcher(languages=['it_IT', 'es_ES', 'es_MX', 'ja_JP'])
        >>> matcher.name('🖥')
        'computer fisso'

        >>> matcher = EmojiMatcher(languages=['fr_FR'])
        >>> matcher.name('🖥')
        'ordinateur de bureau'

        >>> matcher.name('🤔')
        'visage en pleine réflexion'

        >>> matcher = EmojiMatcher(languages=['de_DE'])
        >>> matcher.name('🤔')
        'nachdenkendes Gesicht'

        >>> matcher.name('⚽')
        'Fußball'

        >>> matcher = EmojiMatcher(languages=['de_CH'])
        >>> matcher.name('🤔')
        'nachdenkendes Gesicht'

        >>> matcher.name('⚽')
        'Fussball'

        >>> matcher.name('a')
        ''

        >>> matcher.name(' ')
        'space'
        '''
        found_names = self.names(emoji_string, language=language)
        return found_names[0] if found_names else ''

def keywords(self, emoji_string, language=''):
        '''Look up the keywords of an emoji.

        If a language is requested, only that language is consulted
        and an empty list is returned when the emoji has no keywords
        in that language.

        If no language is requested, the languages of this
        EmojiMatcher (expanded with itb_util.expand_languages()) are
        tried in order and the list of keywords of the first language
        which has keywords for the emoji is returned.  If none of them
        has keywords, an empty list is returned.

        :param emoji_string: The string of Unicode characters which are
                             used to encode the emoji
        :type emoji_string: string
        :param language: The language requested for the keywords
        :type language: string
        :rtype: List of strings

        Examples:

        >>> matcher = EmojiMatcher(languages = ['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE', 'ja_JP'])
        >>> matcher.keywords('🙂')
        ['face', 'smile', 'slightly smiling face']

        >>> matcher.keywords('🙂', language='it')
        ['faccina con sorriso accennato', 'mezzo sorriso', 'sorriso', 'sorriso a bocca chiusa']
        '''
        # The keys of self._emoji_dict use only non-fully-qualified
        # sequences, so normalize before looking anything up:
        lookup_string = self.variation_selector_16_normalize(
            emoji_string, non_fully_qualified=True)
        if language:
            # An explicitly requested language is the only one probed.
            search_languages = [language]
        else:
            search_languages = itb_util.expand_languages(self._languages)
        for search_language in search_languages:
            entry = self._emoji_dict.get((lookup_string, search_language), {})
            if 'keywords' in entry:
                return entry['keywords']
        return []

def categories(self, emoji_string, language=''):
        '''Look up the categories of an emoji.

        If a language is requested, only that language is consulted
        and an empty list is returned when the emoji has no categories
        in that language.

        If no language is requested, the languages of this
        EmojiMatcher (expanded with itb_util.expand_languages()) are
        tried in order and the list of categories of the first
        language which has categories for the emoji is returned.  If
        none of them has categories, an empty list is returned.

        :param emoji_string: The string of Unicode characters which are
                             used to encode the emoji
        :type emoji_string: string
        :param language: The language requested for the categories
        :type language: string
        :rtype: List of strings

        Examples:

        >>> matcher = EmojiMatcher(languages = ['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE', 'ja_JP'])
        >>> matcher.categories('🙂')
        ['people']
        '''
        # The keys of self._emoji_dict use only non-fully-qualified
        # sequences, so normalize before looking anything up:
        lookup_string = self.variation_selector_16_normalize(
            emoji_string, non_fully_qualified=True)
        if language:
            # An explicitly requested language is the only one probed.
            search_languages = [language]
        else:
            search_languages = itb_util.expand_languages(self._languages)
        for search_language in search_languages:
            entry = self._emoji_dict.get((lookup_string, search_language), {})
            if 'categories' in entry:
                return entry['categories']
        return []

def similar(self, emoji_string, match_limit=1000):
        '''Find emoji which are similar to a given emoji.

        “Similar” means they share Unicode categories, categories or
        keywords with the given emoji in the same language.

        :param emoji_string: The string of Unicode characters which are
                             used to encode the emoji
        :type emoji_string: A string
        :param match_limit: The maximum number of results returned
        :type match_limit: Integer
        :rtype: A list of tuples of the form (<emoji>, <name>, <score>),
                i.e. a list like this:

                [('🐫', "cammello ['🐫', 'gobba', 'animale']", 3), ...]

                The name includes the list of categories or keywords
                which matched, the score is the number of categories
                or keywords matched.

                The list is sorted by the position of the language in
                the expanded language list of this EmojiMatcher, then
                by descending score, then by the distance in CLDR
                order to the given emoji, then by CLDR order, then
                longer emoji sequences first, then by name.

        Examples:

        >>> matcher = EmojiMatcher(languages = ['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE', 'ja_JP'])

        >>> matcher.similar('this is not an emoji', match_limit = 5)
        []

        >>> matcher.similar('☺', match_limit = 5)
        [('☺️', 'white smiling face [☺️, So, people, face, outlined, relaxed, smile, smiling face]', 8), ('🙂', 'slightly smiling face [So, people, face, smile]', 4), ('😙', 'kissing face with smiling eyes [So, people, face, smile]', 4), ('😍', 'smiling face with heart-shaped eyes [So, people, face, smile]', 4), ('😎', 'smiling face with sunglasses [So, people, face, smile]', 4)]

        >>> matcher = EmojiMatcher(languages = ['it_IT', 'en_US', 'es_MX', 'es_ES', 'de_DE', 'ja_JP'])
        >>> matcher.similar('☺', match_limit = 5)
        [('☺️', 'faccina sorridente [☺️, contorno faccina sorridente, emozionarsi, faccina, sorridente]', 5), ('😺', 'gatto che sorride [faccina, sorridente]', 2), ('😚', 'faccina che bacia con occhi chiusi [faccina]', 1), ('😙', 'faccina che bacia con occhi sorridenti [faccina]', 1), ('🤗', 'faccina che abbraccia [faccina]', 1)]

        >>> matcher = EmojiMatcher(languages = ['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE', 'ja_JP'])
        >>> matcher.similar('🐫', match_limit = 5)
        [('🐫', 'bactrian camel [🐫, bactrian, camel, hump, two humps, two-hump camel]', 6), ('🐪', 'dromedary camel [camel, hump]', 2), ('🐫', 'bactrian camel [🐫, So, nature, bactrian, camel, hump, two-hump camel]', 7), ('🐪', 'dromedary camel [So, nature, hump, camel]', 4), ('\U0001f999', 'llama [So, nature]', 2)]

        >>> matcher = EmojiMatcher(languages = [ 'it_IT', 'en_US','es_MX', 'es_ES', 'de_DE', 'ja_JP'])
        >>> matcher.similar('🐫', match_limit = 5)
        [('🐫', 'cammello [🐫, animale, cammello, gobba]', 4), ('🐪', 'dromedario [animale, cammello, gobba]', 3), ('🐐', 'capra [animale]', 1), ('🐑', 'pecora [animale]', 1), ('🐘', 'elefante [animale]', 1)]

        >>> matcher = EmojiMatcher(languages = ['de_DE', 'it_IT', 'en_US','es_MX', 'es_ES', 'ja_JP'])
        >>> matcher.similar('🐫', match_limit = 5)
        [('🐫', 'Kamel [🐫, Kamel, Tier, zweihöckrig]', 4), ('🐪', 'Dromedar [Kamel, Tier]', 2), ('🐐', 'Ziege [Tier]', 1), ('🐑', 'Schaf [Tier]', 1), ('🐘', 'Elefant [Tier]', 1)]

        >>> matcher = EmojiMatcher(languages = ['es_MX', 'it_IT', 'de_DE', 'en_US', 'es_ES', 'ja_JP'])
        >>> matcher.similar('🐫', match_limit = 5)
        [('🐫', 'camello [🐫, animal, camélido, camello, joroba]', 5), ('🐪', 'dromedario [animal, camélido, joroba]', 3), ('\U0001f999', 'llama [camélido]', 1), ('🐐', 'cabra [animal]', 1), ('🐑', 'oveja [animal]', 1)]

        >>> matcher = EmojiMatcher(languages = ['es_ES',  'it_IT', 'es_MX', 'de_DE', 'en_US', 'ja_JP'])
        >>> matcher.similar('🐫', match_limit = 5)
        [('🐫', 'camello [🐫, bactriano, camello, desierto, dromedario, jorobas]', 6), ('🐪', 'dromedario [camello, desierto, dromedario]', 3), ('🏜️', 'desierto [desierto]', 1), ('🐫', 'cammello [🐫, animale, cammello, gobba]', 4), ('🐪', 'dromedario [animale, cammello, gobba]', 3)]

        >>> matcher = EmojiMatcher(languages = ['es_ES',  'it_IT', 'es_MX', 'de_DE', 'en_US', 'ja_JP'])
        >>> matcher.similar('€', match_limit = 5)
        [('€', 'euro sign [€, Sc]', 2), ('؋', 'afghani sign [Sc]', 1), ('֏', 'armenian dram sign [Sc]', 1), ('₳', 'austral sign [Sc]', 1), ('৻', 'bengali ganda mark [Sc]', 1)]

        >>> matcher.similar('🏄‍♂', match_limit = 2)
        [('🏄\u200d♂️', 'hombre haciendo surf [🏄\u200d♂️, hombre, hombre haciendo surf, surf, surfero, surfista]', 6), ('🏄🏻\u200d♂️', 'hombre haciendo surf: tono de piel claro [hombre, hombre haciendo surf, surf, surfero, surfista]', 5)]
        '''
        # The keys of self._emoji_dict use only non-fully-qualified
        # sequences, so normalize before looking anything up:
        emoji_string = self.variation_selector_16_normalize(
            emoji_string, non_fully_qualified=True)
        expanded_languages = itb_util.expand_languages(self._languages)
        label_keys = ('ucategories', 'categories', 'keywords')
        # Step 1: collect, per language, the labels of the emoji we
        # are looking for.
        original_labels = {}
        for language in expanded_languages:
            labels_of_original = set()
            original_labels[language] = labels_of_original
            if (emoji_string, language) not in self._emoji_dict:
                continue
            original_entry = self._emoji_dict[(emoji_string, language)]
            for label_key in label_keys:
                for label in original_entry.get(label_key, ()):
                    labels_of_original.add(label)
                    if (label_key == 'ucategories'
                            and label in UNICODE_CATEGORIES):
                        # The label is something like 'So'.  The
                        # labels which follow would be 'Symbol' and
                        # 'Other'.  Using these as well would almost
                        # never change the final result, it would
                        # only add two more strings to the list of
                        # matching labels of *every* similar emoji
                        # and make the candidate list much wider
                        # without giving the user any extra
                        # information.  So skip the remaining
                        # ucategories labels.
                        break
        # Step 2: for every entry in the dictionary, collect the labels
        # it shares with the original emoji in the same language.
        candidate_scores = {}
        for similar_key, similar_entry in self._emoji_dict.items():
            similar_string = similar_key[0]
            language = similar_key[1]
            similar_name = (similar_entry['names'][0]
                            if 'names' in similar_entry
                            else self.name(similar_string))
            if len(similar_string) == 1 and is_invisible(similar_string):
                # Prefix the display name of “invisible” characters
                # with their code point:
                similar_name = ('U+%X' %ord(similar_string)
                                + ' ' + similar_name)
            scores_key = (similar_string, language, similar_name)
            if similar_string == emoji_string:
                # This entry is the original emoji itself.  Count the
                # emoji as one extra label so that it scores higher
                # than any other emoji which shares all categories
                # and all keywords: the most similar emoji should
                # always be the original emoji itself.
                candidate_scores[scores_key] = [
                    self.variation_selector_16_normalize(
                        emoji_string,
                        non_fully_qualified=self._non_fully_qualified)]
            for label_key in label_keys:
                for label in similar_entry.get(label_key, ()):
                    if label in original_labels[language]:
                        candidate_scores.setdefault(
                            scores_key, []).append(label)
        # Step 3: rank the collected candidates and format the result.
        reference_order = self.cldr_order(emoji_string)
        def ranking(item):
            '''Sort key for one (scores_key, matching labels) pair'''
            (candidate, candidate_language, candidate_name), labels = item
            candidate_order = self.cldr_order(candidate)
            return (
                expanded_languages.index(candidate_language), # language index
                - len(labels), # number of matching labels
                abs(candidate_order - reference_order), # CLDR distance
                candidate_order, # CLDR order
                - len(candidate), # length of emoji string
                candidate_name, # emoji name
            )
        ranked = sorted(candidate_scores.items(), key=ranking)[:match_limit]
        return [
            (self.variation_selector_16_normalize(
                scores_key[0],
                non_fully_qualified=self._non_fully_qualified),
             scores_key[2] + ' [' + ', '.join(labels) + ']',
             len(labels))
            for scores_key, labels in ranked]

def emoji_by_label(self):
        '''Group the emoji in self._emoji_dict by language and label.

        The label types used are 'ucategories', 'categories',
        'keywords' and 'names'.  For 'ucategories', all values of an
        entry are joined with ', ' into one single label, for the
        other label types every value is a label of its own.

        Entries are left out if

        - their Unicode version is known and lies outside of the range
          self._emoji_unicode_min ... self._emoji_unicode_max
        - they are a sequence which already contains one of the
          SKIN_TONE_MODIFIERS

        Each list of emoji is sorted by CLDR order, then by the emoji
        string itself.

        :return: A nested dictionary of the form
                 {language: {label_key: {label: [emoji, ...]}}}
        :rtype: Dictionary
        '''
        label_keys = ('ucategories', 'categories', 'keywords', 'names')
        emoji_by_label_dict = {}
        # Whether an entry is used at all does not depend on the label
        # type.  Therefore, normalize and filter each entry only once
        # here instead of repeating that work for every label type.
        usable_entries = []
        for emoji_key, emoji_value in self._emoji_dict.items():
            emoji = self.variation_selector_16_normalize(
                emoji_key[0],
                non_fully_qualified=self._non_fully_qualified)
            unicode_version = self.unicode_version(emoji)
            if unicode_version:
                version = LooseVersion(unicode_version)
                if (version < LooseVersion(self._emoji_unicode_min)
                        or version > LooseVersion(self._emoji_unicode_max)):
                    continue
            if (len(emoji) > 1
                    and any(modifier in emoji
                            for modifier in SKIN_TONE_MODIFIERS)):
                # Skip all emoji which already contain a skin tone
                # modifier, the skin tone variants will be created
                # when needed when browsing the categories in
                # emoji-picker:
                continue
            language = emoji_key[1]
            # Every language with at least one usable entry gets a
            # dictionary, even if no labels are found for it later.
            emoji_by_label_dict.setdefault(language, {})
            usable_entries.append((language, emoji, emoji_value))
        # Keep the label type as the outer loop so that the label
        # types are inserted into each language dictionary in the
        # order given by label_keys.
        for label_key in label_keys:
            for language, emoji, emoji_value in usable_entries:
                if label_key not in emoji_value:
                    continue
                emoji_by_label = emoji_by_label_dict[language].setdefault(
                    label_key, {})
                if label_key == 'ucategories':
                    labels = [', '.join(emoji_value[label_key])]
                else:
                    labels = emoji_value[label_key]
                for label in labels:
                    emoji_by_label.setdefault(label, []).append(emoji)
        for labels_by_key in emoji_by_label_dict.values():
            for emoji_by_label in labels_by_key.values():
                for emoji_list in emoji_by_label.values():
                    emoji_list.sort(key=lambda x: (self.cldr_order(x), x))
        return emoji_by_label_dict

def emoji_order(self, emoji_string):
        '''Returns the “emoji_order” number from emojione

        Useful for sorting emoji. The number is read from the 'en'
        entry of the emoji. For characters which do not have an
        emoji order, 0xffffffff is returned.

        :param emoji_string: An emoji
        :type emoji_string: String
        :rtype: Integer

        Examples:

        >>> matcher = EmojiMatcher(languages = ['en'])
        >>> matcher.emoji_order('😀')
        1

        >>> hex(matcher.emoji_order('∬'))
        '0xffffffff'
        '''
        # The keys of self._emoji_dict use only non-fully-qualified
        # sequences, so normalize before looking anything up:
        lookup_string = self.variation_selector_16_normalize(
            emoji_string, non_fully_qualified=True)
        english_entry = self._emoji_dict.get((lookup_string, 'en'), {})
        if 'emoji_order' not in english_entry:
            return 0xFFFFFFFF
        return int(english_entry['emoji_order'])

def cldr_order(self, emoji_string):
        '''Returns a “cldr_order” number from CLDR

        Useful for sorting emoji. The number is read from the 'en'
        entry of the emoji. For characters which do not have a
        “cldr_order” number, 0xffffffff is returned.

        The “cldr_order” number is generated by parsing
        emoji-test.txt.

        :param emoji_string: An emoji
        :type emoji_string: String
        :rtype: Integer

        Examples:

        >>> matcher = EmojiMatcher(languages = ['en'])
        >>> matcher.cldr_order('😀')
        1

        >>> hex(matcher.cldr_order('∬'))
        '0xffffffff'
        '''
        # The keys of self._emoji_dict use only non-fully-qualified
        # sequences, so normalize before looking anything up:
        lookup_string = self.variation_selector_16_normalize(
            emoji_string, non_fully_qualified=True)
        english_entry = self._emoji_dict.get((lookup_string, 'en'), {})
        if 'cldr_order' not in english_entry:
            return 0xFFFFFFFF
        return int(english_entry['cldr_order'])

def properties(self, emoji_string):
        '''
        Returns the emoji properties of this emoji from the unicode.org data

        The properties are read from the 'en' entry of the emoji. If
        there is no such entry or it has no properties, an empty list
        is returned.

        :param emoji_string: An emoji
        :type emoji_string: String
        :rtype: List of strings
        '''
        # The keys of self._emoji_dict use only non-fully-qualified
        # sequences, so normalize before looking anything up:
        lookup_string = self.variation_selector_16_normalize(
            emoji_string, non_fully_qualified=True)
        english_entry = self._emoji_dict.get((lookup_string, 'en'), {})
        if 'properties' not in english_entry:
            return []
        return english_entry['properties']

def unicode_version(self, emoji_string):
        '''
        Returns the Unicode version when this emoji/character was added

        The version is read from the 'uversion' value of the 'en'
        entry of the emoji. If there is no such entry or it has no
        'uversion' value, an empty string is returned.

        :param emoji_string: An emoji
        :type emoji_string: String
        :rtype: String
        '''
        # The keys of self._emoji_dict use only non-fully-qualified
        # sequences, so normalize before looking anything up:
        lookup_string = self.variation_selector_16_normalize(
            emoji_string, non_fully_qualified=True)
        english_entry = self._emoji_dict.get((lookup_string, 'en'), {})
        if 'uversion' not in english_entry:
            return ''
        return english_entry['uversion']

def skin_tone_modifier_supported(self, emoji_string):
        '''Checks whether skin tone modifiers are possible for this emoji

        Returns True if self.skin_tone_variants() yields more than
        one variant for this emoji_string, False if not.

        :param emoji_string: The emoji to check
        :type emoji_string: String
        :rtype: Boolean

        Examples:

        >>> matcher = EmojiMatcher(languages = ['en'])
        >>> matcher.skin_tone_modifier_supported('👩')
        True

        >>> matcher.skin_tone_modifier_supported('👩🏻')
        True

        >>> matcher.skin_tone_modifier_supported('👮\u200d♀')
        True

        >>> matcher.skin_tone_modifier_supported('😀')
        False

        >>> matcher.skin_tone_modifier_supported('😀🏻')
        False

        >>> matcher.skin_tone_modifier_supported('')
        False

        >>> matcher.skin_tone_modifier_supported('🏻')
        False
        '''
        # An emoji without skin tone support has exactly one
        # “variant”: itself.
        return len(self.skin_tone_variants(emoji_string)) > 1

def skin_tone_variants(self, emoji_string):
        '''
        Returns a list of skin tone variants for the given emoji

        If the given emoji does not support skin tones, a list
        containing only the original emoji is returned.

        For sequences consisting of 2 to 4 parts joined by U+200D
        (ZERO WIDTH JOINER), all combinations of the variants of the
        individual parts are returned. The variants of the last part
        change fastest in the returned list.

        :param emoji_string: The emoji to check
        :type emoji_string: String
        :rtype: List of strings

        Examples:

        >>> matcher = EmojiMatcher(languages = ['en'])
        >>> matcher.skin_tone_variants('👩')
        ['👩', '👩🏻', '👩🏼', '👩🏽', '👩🏾', '👩🏿']

        >>> matcher.skin_tone_variants('👩🏻')
        ['👩', '👩🏻', '👩🏼', '👩🏽', '👩🏾', '👩🏿']

        >>> matcher.skin_tone_variants('👮\u200d♀\ufe0f')
        ['👮\u200d♀\ufe0f', '👮🏻\u200d♀\ufe0f', '👮🏼\u200d♀\ufe0f', '👮🏽\u200d♀\ufe0f', '👮🏾\u200d♀\ufe0f', '👮🏿\u200d♀\ufe0f']

        >>> matcher.skin_tone_variants('👩\u200d🎓')
        ['👩\u200d🎓', '👩🏻\u200d🎓', '👩🏼\u200d🎓', '👩🏽\u200d🎓', '👩🏾\u200d🎓', '👩🏿\u200d🎓']

        >>> matcher.skin_tone_variants('😀')
        ['😀']

        >>> matcher.skin_tone_variants('😀🏿')
        ['😀🏿']

        >>> matcher.skin_tone_variants('')
        ['']

        >>> matcher.skin_tone_variants('🏿')
        ['🏿']

        # Family: woman, girl
        # See: http://unicode.org/Public/emoji/5.0/emoji-zwj-sequences.txt
        # which contains the line:
        #
        # 1F469 200D 1F467; Emoji_ZWJ_Sequence; family: woman, girl # 6.0  [1] (👩‍👧)
        >>> len(matcher.skin_tone_variants('👩\u200d👧'))
        36

        >>> len(matcher.skin_tone_variants('👩🏼\u200d👧🏿'))
        36

        >>> matcher.skin_tone_variants('👩🏼\u200d👧🏿') == matcher.skin_tone_variants('👩\u200d👧')
        True

        >>> matcher.skin_tone_variants('👩\u200d👧')
        ['👩\u200d👧', '👩\u200d👧🏻', '👩\u200d👧🏼', '👩\u200d👧🏽', '👩\u200d👧🏾', '👩\u200d👧🏿', '👩🏻\u200d👧', '👩🏻\u200d👧🏻', '👩🏻\u200d👧🏼', '👩🏻\u200d👧🏽', '👩🏻\u200d👧🏾', '👩🏻\u200d👧🏿', '👩🏼\u200d👧', '👩🏼\u200d👧🏻', '👩🏼\u200d👧🏼', '👩🏼\u200d👧🏽', '👩🏼\u200d👧🏾', '👩🏼\u200d👧🏿', '👩🏽\u200d👧', '👩🏽\u200d👧🏻', '👩🏽\u200d👧🏼', '👩🏽\u200d👧🏽', '👩🏽\u200d👧🏾', '👩🏽\u200d👧🏿', '👩🏾\u200d👧', '👩🏾\u200d👧🏻', '👩🏾\u200d👧🏼', '👩🏾\u200d👧🏽', '👩🏾\u200d👧🏾', '👩🏾\u200d👧🏿', '👩🏿\u200d👧', '👩🏿\u200d👧🏻', '👩🏿\u200d👧🏼', '👩🏿\u200d👧🏽', '👩🏿\u200d👧🏾', '👩🏿\u200d👧🏿']

        >>> len(matcher.skin_tone_variants('👨\u200d👩\u200d👧\u200d👦'))
        1296

        # Woman in lotus position
        # Does support skin tone in http://unicode.org/Public/emoji/5.0/emoji-data.txt
        # which contains the line:
        #
        # “1F9D1..1F9DD  ; Emoji_Modifier_Base  #10.0 [13] (🧑..🧝)    adult..elf”
        >>> matcher.skin_tone_variants('🧘\u200d♀\ufe0f')
        ['\U0001f9d8\u200d♀\ufe0f', '\U0001f9d8🏻\u200d♀\ufe0f', '\U0001f9d8🏼\u200d♀\ufe0f', '\U0001f9d8🏽\u200d♀\ufe0f', '\U0001f9d8🏾\u200d♀\ufe0f', '\U0001f9d8🏿\u200d♀\ufe0f']

        >>> matcher.skin_tone_variants('🏌\ufe0f\u200d♂\ufe0f')
        ['🏌\ufe0f\u200d♂\ufe0f', '🏌🏻\u200d♂\ufe0f', '🏌🏼\u200d♂\ufe0f', '🏌🏽\u200d♂\ufe0f', '🏌🏾\u200d♂\ufe0f', '🏌🏿\u200d♂\ufe0f']

        >>> matcher.skin_tone_variants('✌\ufe0f')
        ['✌\ufe0f', '✌🏻', '✌🏼', '✌🏽', '✌🏾', '✌🏿']

        >>> matcher = EmojiMatcher(languages = ['en'], non_fully_qualified=True)
        >>> matcher.skin_tone_variants('🏌\ufe0f\u200d♂\ufe0f')
        ['🏌\u200d♂', '🏌🏻\u200d♂', '🏌🏼\u200d♂', '🏌🏽\u200d♂', '🏌🏾\u200d♂', '🏌🏿\u200d♂']

        >>> matcher.skin_tone_variants('🏌\u200d♂')
        ['🏌\u200d♂', '🏌🏻\u200d♂', '🏌🏼\u200d♂', '🏌🏽\u200d♂', '🏌🏾\u200d♂', '🏌🏿\u200d♂']
        '''
        if not emoji_string or emoji_string in SKIN_TONE_MODIFIERS:
            return [emoji_string]
        # self._emoji_dict contains only non-fully-qualified sequences:
        emoji_string = self.variation_selector_16_normalize(
            emoji_string, non_fully_qualified=True)
        # “No modifier” followed by each of the skin tone modifiers:
        all_tones = ('',) + SKIN_TONE_MODIFIERS
        if 'Emoji_Modifier_Base' in self.properties(emoji_string):
            return [
                self.variation_selector_16_normalize(
                    emoji_string + tone,
                    non_fully_qualified=self._non_fully_qualified)
                for tone in all_tones]
        # The emptiness check keeps emoji_string[-1] from raising an
        # IndexError in case the normalization above left nothing
        # behind (NOTE(review): presumably possible for input
        # consisting only of U+FE0F, confirm against
        # variation_selector_16_normalize()).
        if (emoji_string
                and emoji_string[-1] in SKIN_TONE_MODIFIERS
                and (emoji_string, 'en') in self._emoji_dict):
            # A known emoji which already ends with a skin tone
            # modifier: exchange that modifier for each possible tone.
            base = emoji_string[:-1]
            return [
                self.variation_selector_16_normalize(
                    base + tone,
                    non_fully_qualified=self._non_fully_qualified)
                for tone in all_tones]
        parts = emoji_string.split('\u200d')
        if 2 <= len(parts) <= 4:
            # Remove all skin tone modifiers from the parts, the
            # variants are created again for each part separately:
            bare_parts = []
            for part in parts:
                for modifier in SKIN_TONE_MODIFIERS:
                    part = part.replace(modifier, '')
                bare_parts.append(part)
            # Build the cross product of the variants of all parts
            # incrementally. The variants of each part are calculated
            # only once and the variants of later parts change fastest.
            combinations = self.skin_tone_variants(bare_parts[0])
            for part in bare_parts[1:]:
                part_variants = self.skin_tone_variants(part)
                combinations = [
                    head + '\u200d' + variant
                    for head in combinations
                    for variant in part_variants]
            if combinations:
                return combinations
        return [self.variation_selector_16_normalize(
            emoji_string,
            non_fully_qualified=self._non_fully_qualified)]

def debug_loading_data(self):
        '''Prints the loaded data to check whether it was loaded correctly

        One line is printed for each entry of self._emoji_dict, in
        sorted order, followed by a line with the number of entries.
        '''
        entries = sorted(self._emoji_dict.items())
        for key, value in entries:
            print("key=%s value=%s" % (key, sorted(value.items())))
        print('count=%s' % len(entries))

if IMPORT_PINYIN_SUCCESSFUL:
        # This method consists only of its docstring and exists to carry
        # the doctests below. It is defined only when
        # IMPORT_PINYIN_SUCCESSFUL is true, so these doctests are not
        # collected otherwise.
        def _doctest_pinyin(self):
            '''
            Doctests for the languages zh_CN and zh_TW which match
            emoji by typing pinyin like “saima”.

            >>> matcher = EmojiMatcher(languages = ['zh_CN'])
            >>> matcher.candidates('saima')[0][:2]
            ('🏇', '赛马 “sàimǎ”')

            >>> matcher.similar('🏇', match_limit=3)
            [('🏇', '赛马 [🏇, 赛马, sàimǎ, 马, mǎ]', 5), ('🏇🏻', '赛马: 种类-1-2 [赛马, sàimǎ, 马, mǎ]', 4), ('🏇🏼', '赛马: 种类-3 [赛马, sàimǎ, 马, mǎ]', 4)]

            >>> matcher = EmojiMatcher(languages = ['zh_TW'])

            >>> matcher.candidates('saima')[0][:2]
            ('🏇', '賽馬 “sàimǎ”')

            >>> matcher.similar('🏇', match_limit=1)
            [('🏇', '賽馬 [🏇, 騎馬, qímǎ]', 3)]
            '''

if IMPORT_PYKAKASI_SUCCESSFUL:
        # This method consists only of its docstring and exists to carry
        # the doctests below. It is defined only when
        # IMPORT_PYKAKASI_SUCCESSFUL is true, so these doctests are not
        # collected otherwise.
        def _doctest_pykakasi(self):
            '''
            Doctests for the language ja_JP with romaji=True, matching
            emoji by typing romaji like “katatsumuri” or hiragana.

            >>> matcher = EmojiMatcher(languages = ['ja_JP'], romaji=True)
            >>> matcher.candidates('katatsumuri')[0][:2]
            ('🐌', 'かたつむり “katatsumuri”')

            >>> matcher.candidates('ねこ＿')[0][:2]
            ('🐈', 'ネコ “ねこ”')

            >>> matcher.similar('🐤', match_limit=5)
            [('🐤', 'ひよこ [🐤, ひな, ひよこ, 動物, どうぶつ, 横を向いているひよこ, よこをむいているひよこ, 顔, かお, 鳥, とり, hina, hiyoko, doubutsu, yokowomuiteiruhiyoko, kao, tori]', 17), ('🐣', '卵からかえったひよこ [ひな, ひよこ, 動物, どうぶつ, 鳥, とり, hina, hiyoko, doubutsu, tori]', 10), ('🐥', '前を向いているひよこ [ひな, ひよこ, 動物, どうぶつ, 鳥, とり, hina, hiyoko, doubutsu, tori]', 10), ('🐦', '鳥 [動物, どうぶつ, 顔, かお, 鳥, とり, doubutsu, kao, tori]', 9), ('🐔', 'にわとり [動物, どうぶつ, 顔, かお, 鳥, とり, doubutsu, kao, tori]', 9)]

            >>> matcher.similar('🐌', match_limit=5)
            [('🐌', 'かたつむり [🐌, かたつむり, でんでん虫, でんでんむし, 虫, むし, katatsumuri, dendenmushi, mushi]', 9), ('🦋', 'チョウ [虫, むし, mushi]', 3), ('🐛', '毛虫 [虫, むし, mushi]', 3), ('🐜', 'アリ [虫, むし, mushi]', 3), ('🐝', 'ミツバチ [虫, むし, mushi]', 3)]

            >>> matcher.similar('😱', match_limit=5)
            [('😱', '恐怖 [😱, がーん, ショック, しょっく, 叫び, さけび, 恐怖, きょうふ, 顔, かお, ga-n, shokku, sakebi, kyoufu, kao]', 15), ('🙀', '絶望する猫 [がーん, ショック, しょっく, 顔, かお, ga-n, shokku, kao]', 8), ('🤯', '頭爆発 [ショック, しょっく, 顔, かお, shokku, kao]', 6), ('😨', '青ざめ [がーん, 顔, かお, ga-n, kao]', 5), ('😰', '冷や汗青ざめ [顔, かお, kao]', 3)]

            '''

def list_emoji_one_bugs(self):
        '''
        Prints possible inconsistencies between emojione and unicode.org

        All English entries of self._emoji_dict are checked and
        a line is printed to stdout for:

        - emoji which have a skin tone variant in self._emoji_dict but
          lack the “Emoji_Modifier_Base” property
        - emoji which have the “Emoji_Modifier_Base” property but whose
          skin tone variant has no “emoji_order” entry (or does not
          exist at all)
        - emoji with the “Emoji_ZWJ_Sequence” property which have no
          “emoji_order” entry
        - emoji containing U+200D which have an “emoji_order” entry
          but lack the “Emoji_ZWJ_Sequence” property

        Nothing is returned.
        '''
        print('--------------------------------------------------')
        print('Possible bugs in emojione.json:')
        print('--------------------------------------------------')
        print('\n')
        for emoji_key in sorted(self._emoji_dict):
            if emoji_key[1] != 'en':
                continue
            emoji = emoji_key[0]
            entry = self._emoji_dict[(emoji, 'en')]
            # Look up the properties only once per emoji:
            properties = self.properties(emoji)
            is_modifier_base = 'Emoji_Modifier_Base' in properties
            # ord() accepts only single characters, therefore each code
            # point is formatted on its own. For emoji consisting of a
            # single code point this gives the same “U+XXXX” as before.
            code_points = ' '.join('U+%X' % ord(char) for char in emoji)
            # The variant using the first skin tone modifier stands
            # in for all skin tone variants of this emoji:
            skin_tone_key = (emoji + SKIN_TONE_MODIFIERS[0], 'en')
            if skin_tone_key in self._emoji_dict and not is_modifier_base:
                print('emoji “%s” (%s) has skintones in emojione '
                      % (emoji, code_points)
                      + 'but not the Emoji_Modifier_Base '
                      + 'property in emoji-data.txt.')
            # A skin tone variant which is missing completely has no
            # “emoji_order” either, .get() makes that case a reported
            # finding instead of a KeyError.
            if (is_modifier_base
                    and 'emoji_order'
                    not in self._emoji_dict.get(skin_tone_key, {})):
                print('emoji “%s” (%s) '
                      % (emoji, code_points)
                      + 'has the property Emoji_Modifier_Base '
                      + 'in emoji-data.txt but no skin tones '
                      + 'in emojione.')
            if 'Emoji_ZWJ_Sequence' in properties:
                if 'emoji_order' not in entry:
                    print('ZWJ sequence “%s” '
                          % emoji
                          + 'from unicode.org missing in emojione')
            elif 'emoji_order' in entry and '\u200d' in emoji:
                print('ZWJ sequence “%s” '
                      % emoji
                      + 'in emojione but not in unicode.org')

# When true, main() runs under cProfile and prints profiling
# statistics after the tests have finished.
BENCHMARK = True

def main():
    '''
    Entry point for testing and profiling this module.

    “python3 itb_emoji.py”

    runs the doctests of this module and, if BENCHMARK is true,
    prints profiling data. The exit status is 1 if any doctest
    failed, 0 otherwise.
    '''
    # A “global” declaration has to come before the first use of the
    # name in the function, therefore it is placed at the top.
    global DOMAINNAME

    if BENCHMARK:
        import cProfile
        import pstats
        profiler = cProfile.Profile()
        profiler.enable()

    # Set this to True to dump the loaded data and the list of possible
    # emojione bugs instead of running the doctests:
    dump_loaded_data = False
    failure_count = 0
    if not dump_loaded_data:
        import doctest
        # Set the domain name to something invalid to avoid using
        # the translations for the doctest tests. Translations may
        # make the tests fail just because some translations are
        # added, changed, or missing.
        DOMAINNAME = ''
        failure_count = doctest.testmod().failed
    else:
        matcher = EmojiMatcher(
            languages=['en_US', 'it_IT', 'es_MX', 'es_ES', 'de_DE',
                       'ja_JP', 'zh_TW', 'zh_CN'],
            unicode_data=True, cldr_data=True)
        matcher.debug_loading_data()
        matcher.list_emoji_one_bugs()

    if BENCHMARK:
        profiler.disable()
        report = pstats.Stats(profiler)
        report.strip_dirs()
        report.sort_stats('cumulative')
        # One table of the 25 most expensive entries for each pattern:
        for pattern in ('itb_emoji', 'difflib', 'enchant'):
            report.print_stats(pattern, 25)

    sys.exit(1 if failure_count else 0)

# Run the tests (and the profiling, if BENCHMARK is true) only when this
# file is executed as a script, not when it is imported as a module.
if __name__ == "__main__":
    main()