Source code for hanzidentifier.core

# -*- coding: utf-8 -*-
from hanzidentifier import helpers

# Result codes returned by identify().
UNKNOWN = 0  # identify() found no Chinese characters in the string
TRAD = TRADITIONAL = 1  # TRAD is a short alias bound to the same value
SIMP = SIMPLIFIED = 2  # SIMP is a short alias bound to the same value
BOTH = 3  # every Chinese character is in helpers.SHARED_CHARACTERS
MIXED = 4  # fallback: no single character set covers the whole string



[docs]
def identify(s):
    """Classify the Chinese characters found in *s*.

    The Chinese characters of *s* are compared against the shared,
    Traditional, and Simplified character sets, in that order, and the
    constant for the first set that contains all of them is returned:

    * :data:`BOTH` -- every character is in the shared set.
    * :data:`TRADITIONAL` -- every character is in the Traditional set.
    * :data:`SIMPLIFIED` -- every character is in the Simplified set.
    * :data:`MIXED` -- none of the sets above covers every character.

    :data:`UNKNOWN` is returned when *s* has no Chinese characters at all.

    Characters that aren't found in the CC-CEDICT dictionary are ignored.

    Since the two writing systems overlap, text written with Simplified
    characters may come back as either :data:`SIMPLIFIED` or :data:`BOTH`,
    depending on whether those characters are Traditional characters too.
    The helpers :func:`is_traditional`, :func:`is_simplified`, and
    :func:`has_chinese` wrap the common checks.

    """
    hanzi = helpers.get_hanzi(s)
    if not hanzi:
        return UNKNOWN

    # Checked in priority order: the shared set comes first so that text
    # valid in both systems is reported as BOTH rather than as one system.
    candidates = (
        (helpers.SHARED_CHARACTERS, BOTH),
        (helpers.TRADITIONAL_CHARACTERS, TRADITIONAL),
        (helpers.SIMPLIFIED_CHARACTERS, SIMPLIFIED),
    )
    for charset, verdict in candidates:
        if hanzi.issubset(charset):
            return verdict
    return MIXED




[docs]
def has_chinese(s):
    """Return ``True`` if *s* contains any Chinese characters.

    This gives the same answer as the check below, but is faster:
        >>> identify('foo') is not UNKNOWN

    """
    hanzi = helpers.get_hanzi(s)
    return True if hanzi else False




[docs]
def is_traditional(s):
    """Return ``True`` if the Chinese characters in *s* are Traditional.

    A string with no Chinese characters yields ``False``. Otherwise the
    result is the same as:
        >>> identify('foo') in (TRADITIONAL, BOTH)

    """
    hanzi = helpers.get_hanzi(s)
    if hanzi:
        # Accept the text if one of these sets contains all its characters.
        acceptable = (
            helpers.SHARED_CHARACTERS,
            helpers.TRADITIONAL_CHARACTERS,
        )
        for charset in acceptable:
            if hanzi.issubset(charset):
                return True
    return False




[docs]
def is_simplified(s):
    """Return ``True`` if the Chinese characters in *s* are Simplified.

    A string with no Chinese characters yields ``False``. Otherwise the
    result is the same as:
        >>> identify('foo') in (SIMPLIFIED, BOTH)

    """
    hanzi = helpers.get_hanzi(s)
    if hanzi:
        # Accept the text if one of these sets contains all its characters.
        acceptable = (
            helpers.SHARED_CHARACTERS,
            helpers.SIMPLIFIED_CHARACTERS,
        )
        for charset in acceptable:
            if hanzi.issubset(charset):
                return True
    return False



def count_chinese(s: str) -> int:
    """Count the characters of *s* that are Chinese.

    Each character of *s* is passed to :func:`has_chinese` on its own, and
    the number of characters it accepts is returned. A character that
    occurs several times is counted once per occurrence.

    """
    return sum(1 for char in s if has_chinese(char))
Source code for hanzidentifier.core

Dragon Mapper

Navigation

Related Topics