Source code for hanzidentifier.core
# -*- coding: utf-8 -*-
from hanzidentifier import helpers
# Identity codes returned by identify().
# No Chinese characters were found in the string.
UNKNOWN = 0
# Characters are compatible with the Traditional system only.
TRADITIONAL = 1
TRAD = TRADITIONAL  # short alias
# Characters are compatible with the Simplified system only.
SIMPLIFIED = 2
SIMP = SIMPLIFIED  # short alias
# Characters are compatible with both systems.
BOTH = 3
# Characters are a mixture of Traditional and Simplified.
MIXED = 4
[docs]
def identify(s):
    """Classify the Chinese characters found in a string.

    *s* is the string to examine. Its Chinese characters are extracted with
    ``helpers.get_hanzi`` and compared against the known character sets.
    The result is one of the module constants:

    * :data:`UNKNOWN` -- *s* contains no Chinese characters.
    * :data:`BOTH` -- every character is shared by the Traditional and
      Simplified systems.
    * :data:`TRADITIONAL` -- every character is a Traditional character.
    * :data:`SIMPLIFIED` -- every character is a Simplified character.
    * :data:`MIXED` -- none of the above, i.e. the string mixes Traditional
      and Simplified characters.

    Characters that aren't found in the CC-CEDICT dictionary are ignored.

    Because the two systems overlap, a string made of Simplified characters
    may identify as :data:`SIMPLIFIED` or as :data:`BOTH`, depending on
    whether those characters are also Traditional ones. The helpers
    :func:`is_traditional`, :func:`is_simplified`, and :func:`has_chinese`
    make testing a string's identity easier.
    """
    hanzi = helpers.get_hanzi(s)
    if not hanzi:
        return UNKNOWN

    # Order matters: the shared set is tested first so that strings
    # compatible with both systems report BOTH rather than one system.
    candidates = (
        (helpers.SHARED_CHARACTERS, BOTH),
        (helpers.TRADITIONAL_CHARACTERS, TRADITIONAL),
        (helpers.SIMPLIFIED_CHARACTERS, SIMPLIFIED),
    )
    for charset, code in candidates:
        if hanzi.issubset(charset):
            return code

    # Not wholly contained in any single set.
    return MIXED
[docs]
def has_chinese(s):
    """Return ``True`` if *s* contains any Chinese characters.

    The result matches ``identify(s) is not UNKNOWN``, but this function
    skips the character-set comparisons that :func:`identify` performs.
    """
    return True if helpers.get_hanzi(s) else False
[docs]
def is_traditional(s):
    """Return ``True`` if the Chinese characters in *s* are Traditional.

    The result matches ``identify(s) in (TRADITIONAL, BOTH)``. A string
    with no Chinese characters yields ``False``.
    """
    hanzi = helpers.get_hanzi(s)
    if not hanzi:
        return False

    # Accept strings wholly inside the shared set or the Traditional set.
    acceptable = (helpers.SHARED_CHARACTERS, helpers.TRADITIONAL_CHARACTERS)
    return any(hanzi.issubset(charset) for charset in acceptable)
[docs]
def is_simplified(s):
    """Return ``True`` if the Chinese characters in *s* are Simplified.

    The result matches ``identify(s) in (SIMPLIFIED, BOTH)``. A string
    with no Chinese characters yields ``False``.
    """
    hanzi = helpers.get_hanzi(s)
    if not hanzi:
        return False

    # Accept strings wholly inside the shared set or the Simplified set.
    acceptable = (helpers.SHARED_CHARACTERS, helpers.SIMPLIFIED_CHARACTERS)
    return any(hanzi.issubset(charset) for charset in acceptable)
def count_chinese(s: str) -> int:
    """Count the characters of *s* that are Chinese.

    Each character of *s* is tested individually with :func:`has_chinese`;
    the number of characters that pass is returned. An empty string
    yields ``0``.
    """
    return sum(1 for char in s if has_chinese(char))