# Source code for dragonmapper.hanzi

# -*- coding: utf-8 -*-
"""Identification and transliteration functions for Chinese characters."""

import re

import hanzidentifier
import zhon.hanzi
import zhon.pinyin

import dragonmapper.data
from dragonmapper.transcriptions import (
    accented_to_numbered,
    pinyin_to_ipa,
    pinyin_to_zhuyin,
)

# Script-identification result constants, re-exported from hanzidentifier.
UNKNOWN = hanzidentifier.UNKNOWN
BOTH = hanzidentifier.BOTH
MIXED = hanzidentifier.MIXED

# Each of these is available under a long and a short name.
TRADITIONAL = hanzidentifier.TRADITIONAL
TRAD = TRADITIONAL
SIMPLIFIED = hanzidentifier.SIMPLIFIED
SIMP = SIMPLIFIED

# Identification helpers, re-exported from hanzidentifier unchanged.
identify = hanzidentifier.identify
has_chinese = hanzidentifier.has_chinese
is_traditional = hanzidentifier.is_traditional
is_simplified = hanzidentifier.is_simplified

# Separator placed between alternative readings of one word/character.
_READING_SEPARATOR = "/"


def _load_data():
    """Load the word and character mapping data into a dictionary.

    In the data files, each line is formatted like this:
        HANZI<TAB>PINYIN_READING/PINYIN_READING

    Each line is therefore split on the tab character, and the Pinyin field is
    then split on the reading separator (``_READING_SEPARATOR``).

    The return value has two keys, ``'words'`` and ``'characters'``, each
    mapping to a dictionary of the form ``{hanzi: [reading, reading, ...]}``.

    """
    sources = (
        ("words", "hanzi_pinyin_words.tsv"),
        ("characters", "hanzi_pinyin_characters.tsv"),
    )
    data = {}
    for name, file_name in sources:
        # Split the lines by tabs: (hanzi, pinyin) pairs.
        entries = (
            line.split("\t") for line in dragonmapper.data.load_data_file(file_name)
        )
        # Make a dictionary: {hanzi: [pinyin, pinyin]...}. The same separator
        # constant is used when the readings are joined back together, so the
        # two can never drift apart.
        data[name] = {
            hanzi: pinyin.split(_READING_SEPARATOR) for hanzi, pinyin in entries
        }
    return data


# The mapping data is loaded once, when this module is imported.
_HANZI_PINYIN_MAP = _load_data()
# Shortcuts to the two sub-dictionaries of the loaded data.
_WORDS = _HANZI_PINYIN_MAP["words"]
_CHARACTERS = _HANZI_PINYIN_MAP["characters"]


def _hanzi_to_pinyin(hanzi):
    """Look up the Pinyin reading(s) for a Chinese word.

    When *hanzi* is a key of the word table, its list of word readings is
    returned: [WORD_READING1, WORD_READING2, ...]

    Otherwise *hanzi* is looked up one character at a time and a list with
    one entry per character is returned. The entry for a known character is
    its list of readings; the entry for an unknown character is the character
    itself: [[CHAR_READING1, CHAR_READING2 ...], CHAR, ...]

    """
    word_readings = _HANZI_PINYIN_MAP["words"].get(hanzi)
    if word_readings is not None:
        return word_readings

    # Not a known word, so fall back to per-character lookups.
    per_character = []
    for character in hanzi:
        per_character.append(_CHARACTERS.get(character, character))
    return per_character


def _enclose_readings(container, readings):
    """Wrap *readings* in the two items of *container*, e.g. '[' and ']'.

    *container* must unpack into exactly two items (an opening and a closing
    marker); anything else raises ``ValueError``.

    """
    opening, closing = tuple(container)
    return "{container_start}{readings}{container_end}".format(
        readings=readings, container_start=opening, container_end=closing
    )


def to_pinyin(s, delimiter=" ", all_readings=False, container="[]", accented=True):
    """Convert a string's Chinese characters to Pinyin readings.

    *s* is a string containing Chinese characters.

    *delimiter* is the character used to indicate word boundaries in *s*.
    This is used to differentiate between words and characters so that a more
    accurate reading can be returned. It is always matched literally, even if
    it is a regular-expression metacharacter such as a backslash.

    *all_readings* is a boolean value indicating whether or not to return all
    possible readings in the case of words/characters that have multiple
    readings.

    *container* is a two character string that is used to enclose
    words/characters if *all_readings* is ``True``. The default ``'[]'`` is
    used like this: ``'[READING1/READING2]'``.

    *accented* is a boolean value indicating whether to return accented or
    numbered Pinyin readings.

    Characters not recognized as Chinese are left untouched.

    """
    # Any falsy *s* (e.g. '' or None) is treated as empty input.
    text = s or ""

    # A segment is a run of characters that are neither the delimiter nor
    # Chinese punctuation. The delimiter is escaped so that it is taken
    # literally inside the character class, and the pattern is compiled once
    # instead of being rebuilt for every segment.
    segment_pattern = re.compile(
        "[^{}{}]+".format(re.escape(delimiter), zhon.hanzi.punctuation)
    )

    pinyin = ""
    position = 0
    for match in segment_pattern.finditer(text):
        # Copy over the delimiters/punctuation that precede this segment.
        pinyin += text[position : match.start()]
        position = match.end()

        # Get the Chinese word/character readings.
        segment = match.group()
        readings = _hanzi_to_pinyin(segment)

        # The segment is a known word: *readings* is a list of word readings.
        if segment in _WORDS:
            if all_readings:
                pinyin += _enclose_readings(
                    container, _READING_SEPARATOR.join(readings)
                )
            else:
                pinyin += readings[0]
            continue

        # Otherwise *readings* holds one entry per character.
        for character in readings:
            # Don't touch unrecognized characters.
            if isinstance(character, str):
                pinyin += character
            # Format multiple readings.
            elif all_readings:
                pinyin += _enclose_readings(
                    container, _READING_SEPARATOR.join(character)
                )
            # Select and format the first listed reading.
            else:
                syllable = character[0]
                # Add an apostrophe to separate syllables when a syllable
                # that starts with a vowel directly follows a lowercase
                # Pinyin letter.
                if (
                    pinyin
                    and syllable[0] in zhon.pinyin.vowels
                    and pinyin[-1] in zhon.pinyin.lowercase
                ):
                    pinyin += "'"
                pinyin += syllable

    # Copy over whatever follows the last segment.
    pinyin += text[position:]

    if accented:
        return pinyin
    return accented_to_numbered(pinyin)
def to_zhuyin(s, delimiter=" ", all_readings=False, container="[]"):
    """Convert a string's Chinese characters to Zhuyin readings.

    The readings are first produced as numbered Pinyin by ``to_pinyin`` and
    then converted with ``pinyin_to_zhuyin``.

    *s* is a string containing Chinese characters.

    *delimiter* is the character used to indicate word boundaries in *s*.
    This is used to differentiate between words and characters so that a more
    accurate reading can be returned.

    *all_readings* is a boolean value indicating whether or not to return all
    possible readings in the case of words/characters that have multiple
    readings.

    *container* is a two character string that is used to enclose
    words/characters if *all_readings* is ``True``. The default ``'[]'`` is
    used like this: ``'[READING1/READING2]'``.

    Characters not recognized as Chinese are left untouched.

    """
    return pinyin_to_zhuyin(
        to_pinyin(
            s,
            delimiter=delimiter,
            all_readings=all_readings,
            container=container,
            accented=False,
        )
    )
def to_ipa(s, delimiter=" ", all_readings=False, container="[]"):
    """Convert a string's Chinese characters to IPA.

    The readings are first produced as numbered Pinyin by ``to_pinyin`` and
    then converted with ``pinyin_to_ipa``.

    *s* is a string containing Chinese characters.

    *delimiter* is the character used to indicate word boundaries in *s*.
    This is used to differentiate between words and characters so that a more
    accurate reading can be returned.

    *all_readings* is a boolean value indicating whether or not to return all
    possible readings in the case of words/characters that have multiple
    readings.

    *container* is a two character string that is used to enclose
    words/characters if *all_readings* is ``True``. The default ``'[]'`` is
    used like this: ``'[READING1/READING2]'``.

    Characters not recognized as Chinese are left untouched.

    """
    return pinyin_to_ipa(
        to_pinyin(
            s,
            delimiter=delimiter,
            all_readings=all_readings,
            container=container,
            accented=False,
        )
    )