1 Star 0 Fork 0

XY.LongzzZ/chardet

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
test.py 5.91 KB
一键复制 编辑 原始数据 按行查看 历史
"""
Run chardet on a bunch of documents and see that we get the correct encodings.
:author: Dan Blanchard
:author: Ian Cordasco
"""
import textwrap
from difflib import ndiff
from os import listdir
from os.path import dirname, isdir, join, realpath, relpath, splitext
from pprint import pformat
try:
import hypothesis.strategies as st
from hypothesis import Verbosity, assume, given, settings
HAVE_HYPOTHESIS = True
except ImportError:
HAVE_HYPOTHESIS = False
import pytest
import chardet
from chardet.metadata.languages import LANGUAGES
# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
# retrain model.
# Encodings whose test directories exist but are skipped entirely because the
# current models cannot detect them (checked in gen_test_params).
MISSING_ENCODINGS = {
"iso-8859-2",
"iso-8859-6",
"windows-1250",
"windows-1254",
"windows-1256",
}
# Individual test documents known to be misdetected; these are parametrized
# with pytest.mark.xfail rather than skipped (see gen_test_params).
EXPECTED_FAILURES = {
"tests/iso-8859-7-greek/disabled.gr.xml",
"tests/iso-8859-9-turkish/_ude_1.txt",
"tests/iso-8859-9-turkish/_ude_2.txt",
"tests/iso-8859-9-turkish/divxplanet.com.xml",
"tests/iso-8859-9-turkish/subtitle.srt",
"tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt",
}
def gen_test_params():
    """Yields tuples of paths and encodings to use for test_encoding_detection"""
    base_path = relpath(join(dirname(realpath(__file__)), "tests"))
    for dir_name in listdir(base_path):
        dir_path = join(base_path, dir_name)
        # Plain files sitting directly in the tests directory are not
        # per-encoding directories, so they are ignored.
        if not isdir(dir_path):
            continue
        # The directory name is "<encoding>" or "<encoding>-<language>";
        # normalize case and strip the first matching language suffix.
        encoding = dir_name.lower()
        for language in sorted(LANGUAGES):
            suffix = "-" + language.lower()
            if encoding.endswith(suffix):
                encoding = encoding.rpartition(suffix)[0]
                break
        # Skip directories for encodings we don't handle yet.
        if encoding in MISSING_ENCODINGS:
            continue
        # Yield one test case per document file of this encoding.
        for file_name in listdir(dir_path):
            if splitext(file_name)[1].lower() not in (".html", ".txt", ".xml", ".srt"):
                continue
            full_path = join(dir_path, file_name)
            if full_path in EXPECTED_FAILURES:
                yield pytest.param(full_path, encoding, marks=pytest.mark.xfail)
            else:
                yield full_path, encoding
@pytest.mark.parametrize("file_name, encoding", gen_test_params())
def test_encoding_detection(file_name, encoding):
    """Assert that chardet names the expected encoding for one test document.

    A name mismatch is tolerated when decoding with the detected encoding
    yields the exact same text as decoding with the expected one, since the
    observable behavior is then identical.
    """
    with open(file_name, "rb") as handle:
        raw = handle.read()
    result = chardet.detect(raw)
    # NOTE: only LookupError is tolerated here; a UnicodeDecodeError for the
    # *expected* encoding would mean a broken fixture and should propagate.
    try:
        expected_unicode = raw.decode(encoding)
    except LookupError:
        expected_unicode = ""
    # The detected encoding may be None (TypeError) or simply wrong for the
    # bytes (UnicodeDecodeError), so the net is wider here.
    try:
        detected_unicode = raw.decode(result["encoding"])
    except (LookupError, UnicodeDecodeError, TypeError):
        detected_unicode = ""
    encoding_match = bool(result) and (result["encoding"] or "").lower() == encoding
    # Only care about mismatches that would actually result in different
    # behavior when decoding
    if encoding_match or expected_unicode == detected_unicode:
        encoding_match = True
        diff = ""
        all_encodings = [result]
    else:
        wrapped_expected = "\n".join(textwrap.wrap(expected_unicode, 100)) + "\n"
        wrapped_detected = "\n".join(textwrap.wrap(detected_unicode, 100)) + "\n"
        delta = ndiff(
            wrapped_expected.splitlines(True), wrapped_detected.splitlines(True)
        )
        diff = "".join(list(delta)[:20])
        all_encodings = chardet.detect_all(raw, ignore_threshold=True)
    assert encoding_match, (
        f"Expected {encoding}, but got {result} for {file_name}. First 20 "
        f"lines of character differences: \n{diff}\n"
        f"All encodings: {pformat(all_encodings)}"
    )
if HAVE_HYPOTHESIS:

    class JustALengthIssue(Exception):
        """Raised when a detection miss is only due to the input being too short."""

    @pytest.mark.xfail
    @given(
        st.text(min_size=1),
        st.sampled_from(
            [
                "ascii",
                "utf-8",
                "utf-16",
                "utf-32",
                "iso-8859-7",
                "iso-8859-8",
                "windows-1255",
            ]
        ),
        st.randoms(),
    )
    @settings(max_examples=200)
    def test_never_fails_to_detect_if_there_is_a_valid_encoding(txt, enc, rnd):
        """If some encoding can represent `txt`, chardet should detect *something*.

        When it does not, probe whether any longer string starting with `txt`
        is detectable; if so, the failure is "just a length issue" and the
        inner check raises JustALengthIssue, satisfying pytest.raises.
        """
        try:
            data = txt.encode(enc)
        except UnicodeEncodeError:
            # Not every sampled encoding can represent every generated string.
            assume(False)
        detected = chardet.detect(data)["encoding"]
        if detected is None:
            with pytest.raises(JustALengthIssue):

                @given(st.text(), random=rnd)
                @settings(verbosity=Verbosity.quiet, max_shrinks=0, max_examples=50)
                def string_poisons_following_text(suffix):
                    try:
                        extended = (txt + suffix).encode(enc)
                    except UnicodeEncodeError:
                        assume(False)
                    result = chardet.detect(extended)
                    if result and result["encoding"] is not None:
                        raise JustALengthIssue()

                # BUG FIX: the @given-wrapped property must be invoked to run;
                # previously it was only defined, so JustALengthIssue could
                # never be raised and pytest.raises could never succeed.
                string_poisons_following_text()

    @given(
        st.text(min_size=1),
        st.sampled_from(
            [
                "ascii",
                "utf-8",
                "utf-16",
                "utf-32",
                "iso-8859-7",
                "iso-8859-8",
                "windows-1255",
            ]
        ),
        st.randoms(),
    )
    @settings(max_examples=200)
    def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
        """detect() must return the same top encoding as detect_all()[0]."""
        try:
            data = txt.encode(enc)
        except UnicodeEncodeError:
            assume(False)
        result = chardet.detect(data)
        results = chardet.detect_all(data)
        # BUG FIX: the old `except Exception: raise Exception(f"{result} != ...")`
        # hit a NameError inside the handler whenever detect() itself raised
        # before `result`/`results` were bound, masking the real error. A plain
        # assert keeps the diagnostic message without that hazard.
        assert result["encoding"] == results[0]["encoding"], f"{result} != {results}"
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/XYLongzzZ/chardet.git
git@gitee.com:XYLongzzZ/chardet.git
XYLongzzZ
chardet
chardet
master

搜索帮助

0d507c66 1850385 C8b1a773 1850385