Source code for dragonmapper.hanzi

# -*- coding: utf-8 -*-
"""Identification and transliteration functions for Chinese characters."""

import re

import hanzidentifier
import zhon.hanzi
import zhon.pinyin

import dragonmapper.data
from dragonmapper.transcriptions import (
    accented_to_numbered,
    pinyin_to_ipa,
    pinyin_to_zhuyin,
)

# Character-set identification constants, re-exported from hanzidentifier.
UNKNOWN = hanzidentifier.UNKNOWN
BOTH = hanzidentifier.BOTH
MIXED = hanzidentifier.MIXED
TRADITIONAL = hanzidentifier.TRADITIONAL
SIMPLIFIED = hanzidentifier.SIMPLIFIED

# Short aliases for the two long constant names above.
TRAD = TRADITIONAL
SIMP = SIMPLIFIED

# Identification helpers, re-exported from hanzidentifier.
identify = hanzidentifier.identify
has_chinese = hanzidentifier.has_chinese
is_simplified = hanzidentifier.is_simplified
is_traditional = hanzidentifier.is_traditional

# Character placed between alternative readings when several are returned.
_READING_SEPARATOR = "/"


def _load_data():
    """Build the Hanzi-to-Pinyin lookup tables from the bundled data files.

    Every line of a data file holds a Hanzi key, a tab character, and one or
    more Pinyin readings separated by '/':
        HANZI<TAB>PINYIN_READING/PINYIN_READING

    Returns a dictionary with two entries, ``"words"`` and ``"characters"``,
    each mapping a Hanzi string to its list of Pinyin readings.

    """
    sources = (
        ("words", "hanzi_pinyin_words.tsv"),
        ("characters", "hanzi_pinyin_characters.tsv"),
    )
    tables = {}
    for table_name, file_name in sources:
        # Split every line first: [[hanzi, pinyin], ...].
        rows = []
        for raw_line in dragonmapper.data.load_data_file(file_name):
            rows.append(raw_line.split("\t"))
        # Then map each Hanzi key to its list of readings.
        mapping = {}
        for key, joined_readings in rows:
            mapping[key] = joined_readings.split("/")
        tables[table_name] = mapping
    return tables


# Load both lookup tables once, at import time.
_HANZI_PINYIN_MAP = _load_data()
# Convenience references to the individual tables.
_WORDS = _HANZI_PINYIN_MAP["words"]
_CHARACTERS = _HANZI_PINYIN_MAP["characters"]


def _hanzi_to_pinyin(hanzi):
    """Look up the Pinyin readings for a Chinese word or its characters.

    When *hanzi* is a key of the word table, the word's readings are returned
    as a flat list: [WORD_READING1, WORD_READING2, ...]

    Otherwise one entry is returned per character of *hanzi*. A character
    found in the character table contributes its list of readings; a character
    that is not found contributes itself, unchanged:
    [[CHAR_READING1, CHAR_READING2, ...], CHAR, ...]

    """
    word_table = _HANZI_PINYIN_MAP["words"]
    if hanzi in word_table:
        return word_table[hanzi]

    # Not a known word: fall back to character-by-character lookup.
    per_character = []
    for symbol in hanzi:
        per_character.append(_CHARACTERS.get(symbol, symbol))
    return per_character


def _enclose_readings(container, readings):
    """Enclose a reading within a container, e.g. '[]'."""
    container_start, container_end = tuple(container)
    enclosed_readings = "{container_start}{readings}{container_end}".format(
        container_start=container_start, container_end=container_end, readings=readings
    )
    return enclosed_readings



[docs]
def to_pinyin(s, delimiter=" ", all_readings=False, container="[]", accented=True):
    """Convert a string's Chinese characters to Pinyin readings.

    *s* is a string containing Chinese characters. *accented* is a
    boolean value indicating whether to return accented or numbered Pinyin
    readings.

    *delimiter* is the character used to indicate word boundaries in *s*.
    This is used to differentiate between words and characters so that a more
    accurate reading can be returned. The delimiter is matched literally, so
    characters that are special in regular expressions (such as a backslash)
    are safe to use.

    *all_readings* is a boolean value indicating whether or not to return all
    possible readings in the case of words/characters that have multiple
    readings. *container* is a two character string that is used to
    enclose words/characters if *all_readings* is ``True``. The default
    ``'[]'`` is used like this: ``'[READING1/READING2]'``.

    Characters not recognized as Chinese are left untouched.

    Returns the converted string. Delimiters and Chinese punctuation found in
    *s* are copied to the result unchanged.

    """
    # A token is a run of characters that are neither delimiters nor Chinese
    # punctuation. The pattern does not change between iterations, so it is
    # compiled once. The delimiter is escaped because it is placed inside a
    # character class, where e.g. a backslash or a hyphen between two other
    # characters would otherwise be interpreted as regex syntax.
    token_pattern = re.compile(
        "[^{}{}]+".format(re.escape(delimiter), zhon.hanzi.punctuation)
    )

    hanzi = s
    pinyin = ""

    # Consume the given string one token at a time.
    while hanzi:
        match = token_pattern.search(hanzi)

        # Only delimiters/punctuation are left: copy them and stop.
        if match is None:
            pinyin += hanzi
            break

        match_start, match_end = match.span()
        token = match.group()

        # Copy the delimiters/punctuation that occur before the token.
        pinyin += hanzi[:match_start]

        # Get the Chinese word/character readings.
        readings = _hanzi_to_pinyin(token)

        if token in _WORDS:
            # The token is a known word; *readings* is a flat list of
            # whole-word readings.
            if all_readings:
                pinyin += _enclose_readings(
                    container, _READING_SEPARATOR.join(readings)
                )
            else:
                pinyin += readings[0]
        else:
            # The token is not a known word; *readings* holds one entry per
            # character.
            for character in readings:
                if isinstance(character, str):
                    # Don't touch unrecognized characters.
                    pinyin += character
                elif isinstance(character, list):
                    if all_readings:
                        # Format multiple readings.
                        pinyin += _enclose_readings(
                            container, _READING_SEPARATOR.join(character)
                        )
                    else:
                        # Use the first listed reading. Add an apostrophe to
                        # separate syllables when a vowel-initial syllable
                        # directly follows a lowercase Pinyin letter.
                        if (
                            pinyin
                            and character[0][0] in zhon.pinyin.vowels
                            and pinyin[-1] in zhon.pinyin.lowercase
                        ):
                            pinyin += "'"
                        pinyin += character[0]

        # Move ahead in the given string.
        hanzi = hanzi[match_end:]

    if accented:
        return pinyin
    return accented_to_numbered(pinyin)




[docs]
def to_zhuyin(s, delimiter=" ", all_readings=False, container="[]"):
    """Convert a string's Chinese characters to Zhuyin readings.

    *s* is a string containing Chinese characters.

    *delimiter* is the character that marks word boundaries in *s*. Knowing
    where words begin and end lets whole-word readings be preferred over
    character-by-character readings.

    *all_readings* is a boolean value that selects whether every possible
    reading is returned for words/characters that have more than one.
    *container* is a two character string used to enclose such
    words/characters when *all_readings* is ``True``. The default ``'[]'``
    is used like this: ``'[READING1/READING2]'``.

    Characters not recognized as Chinese are left untouched.

    """
    # Zhuyin is derived from the numbered (non-accented) Pinyin conversion.
    return pinyin_to_zhuyin(
        to_pinyin(
            s,
            delimiter=delimiter,
            all_readings=all_readings,
            container=container,
            accented=False,
        )
    )




[docs]
def to_ipa(s, delimiter=" ", all_readings=False, container="[]"):
    """Convert a string's Chinese characters to IPA.

    *s* is a string containing Chinese characters.

    *delimiter* is the character that marks word boundaries in *s*. Knowing
    where words begin and end lets whole-word readings be preferred over
    character-by-character readings.

    *all_readings* is a boolean value that selects whether every possible
    reading is returned for words/characters that have more than one.
    *container* is a two character string used to enclose such
    words/characters when *all_readings* is ``True``. The default ``'[]'``
    is used like this: ``'[READING1/READING2]'``.

    Characters not recognized as Chinese are left untouched.

    """
    # IPA is derived from the numbered (non-accented) Pinyin conversion.
    return pinyin_to_ipa(
        to_pinyin(
            s,
            delimiter=delimiter,
            all_readings=all_readings,
            container=container,
            accented=False,
        )
    )
Source code for dragonmapper.hanzi

Dragon Mapper

Navigation

Related Topics