1 Star 0 Fork 0

Jurge/chardet

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
bench.py 4.75 KB
一键复制 编辑 原始数据 按行查看 历史
Dan Blanchard 提交于 2020-12-10 16:29 . Run black
"""
Run chardet on a bunch of documents and see that we get the correct encodings.
:author: Dan Blanchard
:author: Ian Cordasco
"""
import argparse
import sys
import time
from collections import defaultdict
from os import listdir
from os.path import dirname, isdir, join, realpath, relpath, splitext
import chardet
# cchardet is optional: it is only used when the --cchardet flag is passed,
# so a missing installation must not prevent the script from loading.
# Only ImportError is caught so that unrelated failures (and
# KeyboardInterrupt/SystemExit) are not silently swallowed.
try:
    import cchardet

    HAVE_CCHARDET = True
except ImportError:
    HAVE_CCHARDET = False
# Encodings whose test directories are skipped by get_test_files() because
# they are not handled yet.
# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
# retrain model.
MISSING_ENCODINGS = {
    "iso-8859-2",
    "iso-8859-6",
    "windows-1250",
    "windows-1254",
    "windows-1256",
}

# Individual test files that get_test_files() leaves out of the benchmark.
EXPECTED_FAILURES = {
    "tests/iso-8859-7-greek/disabled.gr.xml",
    "tests/iso-8859-9-turkish/divxplanet.com.xml",
    "tests/iso-8859-9-turkish/subtitle.srt",
    "tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt",
}
def get_py_impl():
    """Name the Python implementation running this script.

    Returns one of "PyPy", "Jython", "IronPython" or "CPython"; any
    interpreter that is not recognised is reported as "CPython".
    """
    # PyPy is the only implementation that exposes this attribute on sys.
    if hasattr(sys, "pypy_version_info"):
        return "PyPy"
    platform_name = sys.platform
    if platform_name.startswith("java"):
        return "Jython"
    return "IronPython" if platform_name == "cli" else "CPython"
def get_test_files():
    """Yield ``(path, encoding)`` pairs to use for timing chardet.detect.

    Walks every sub-directory of the ``tests`` directory located next to this
    script.  The directory name, lower-cased and with any language suffix
    (e.g. ``-greek``) removed, is reported as the encoding.  The following
    are skipped:

    * directories whose encoding is in ``MISSING_ENCODINGS``;
    * files whose extension is not ``.html``, ``.txt``, ``.xml`` or ``.srt``;
    * files listed in ``EXPECTED_FAILURES``.

    Yields:
        tuple: the path of the file (relative to the current working
        directory) and the encoding name derived from its directory.
    """
    language_suffixes = (
        "-arabic",
        "-bulgarian",
        "-cyrillic",
        "-greek",
        "-hebrew",
        "-hungarian",
        "-turkish",
    )
    allowed_extensions = {".html", ".txt", ".xml", ".srt"}
    base_path = relpath(join(dirname(realpath(__file__)), "tests"))
    for dir_name in listdir(base_path):
        path = join(base_path, dir_name)
        # Skip files in tests directory
        if not isdir(path):
            continue
        # Remove language suffixes from encoding if present
        encoding = dir_name.lower()
        for postfix in language_suffixes:
            if encoding.endswith(postfix):
                encoding = encoding.rpartition(postfix)[0]
                break
        # Skip directories for encodings we don't handle yet.
        if encoding in MISSING_ENCODINGS:
            continue
        # Test encoding detection for each file we have of encoding for
        for file_name in listdir(path):
            ext = splitext(file_name)[1].lower()
            if ext not in allowed_extensions:
                continue
            # EXPECTED_FAILURES entries are spelled "tests/<dir>/<file>" with
            # forward slashes.  Build the lookup key the same way instead of
            # using the joined path, which varies with the current working
            # directory and the OS path separator and would otherwise make
            # this check silently never match.
            failure_key = "/".join(("tests", dir_name, file_name))
            if failure_key in EXPECTED_FAILURES:
                continue
            yield join(path, file_name), encoding
def _calls_per_second(num_calls, seconds):
    """Return ``num_calls / seconds`` without ever dividing by zero."""
    if seconds > 0:
        return num_calls / seconds
    # No measurable time elapsed: report an unbounded rate if any call was
    # made, and 0.0 if nothing was benchmarked at all.
    return float("inf") if num_calls else 0.0


def benchmark(chardet_mod=chardet, verbose=False, num_iters=10):
    """Time ``chardet_mod.detect`` over the test files and print the results.

    Each file yielded by ``get_test_files()`` is read once as bytes (outside
    the timed region) and then passed to ``chardet_mod.detect`` ``num_iters``
    times.  Afterwards the number of calls per second is printed for each
    encoding and for the whole run.

    Args:
        chardet_mod: module to benchmark; it must provide ``detect``,
            ``__name__`` and ``__version__``.
        verbose: when true, print the average time for every file; otherwise
            print a single ``.`` per file as a progress indicator.
        num_iters: number of times ``detect`` is called on each file.
    """
    print(
        f"Benchmarking {chardet_mod.__name__} {chardet_mod.__version__} "
        f"on {get_py_impl()} {sys.version}"
    )
    print("-" * 80)
    total_time = 0
    num_files = 0
    encoding_times = defaultdict(float)
    encoding_num_files = defaultdict(int)
    for full_path, encoding in get_test_files():
        num_files += 1
        with open(full_path, "rb") as f:
            input_bytes = f.read()
        # perf_counter() is the high-resolution clock intended for measuring
        # short durations; time.time() can be coarse and can jump when the
        # system clock is adjusted.
        start = time.perf_counter()
        for _ in range(num_iters):
            chardet_mod.detect(input_bytes)
        bench_time = time.perf_counter() - start
        if verbose:
            # Guard the average so a non-positive num_iters cannot raise.
            average = bench_time / num_iters if num_iters > 0 else 0.0
            print(f"Average time for {full_path}: {average}s")
        else:
            print(".", end="")
            sys.stdout.flush()
        total_time += bench_time
        encoding_times[encoding] += bench_time
        encoding_num_files[encoding] += 1
    print("\nCalls per second for each encoding:")
    for encoding in sorted(encoding_times):
        calls_per_sec = _calls_per_second(
            num_iters * encoding_num_files[encoding], encoding_times[encoding]
        )
        print(f"{encoding}: {calls_per_sec}")
    # total_time is 0 when no test files were found; avoid ZeroDivisionError.
    calls_per_sec = _calls_per_second(num_iters * num_files, total_time)
    print(f"\nTotal time: {total_time}s ({calls_per_sec} calls per second)")
def main():
    """Parse the command line and run the benchmark.

    Options:
        -c/--cchardet: benchmark cchardet instead of chardet.
        -i/--iterations: number of times each file is processed (default 10).
        -v/--verbose: print the timing for each individual file.

    Exits with status 2 if ``--iterations`` is smaller than 1, and with
    status 1 if ``--cchardet`` is requested but cchardet is not installed.
    """
    parser = argparse.ArgumentParser(
        description="Times how long it takes to process each file in test set "
        "multiple times.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-c",
        "--cchardet",
        action="store_true",
        help="Run benchmarks for cChardet instead of chardet, if it is installed.",
    )
    parser.add_argument(
        "-i",
        "--iterations",
        help="Number of times to process each file",
        type=int,
        default=10,
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Prints out the timing for each individual file.",
        action="store_true",
    )
    args = parser.parse_args()

    # Zero or negative iteration counts would time nothing and make the
    # per-file average undefined, so reject them up front.
    if args.iterations < 1:
        parser.error("--iterations must be at least 1")

    if args.cchardet and not HAVE_CCHARDET:
        # Report the problem on stderr so it is not mixed into benchmark output.
        print(
            "You must pip install cchardet if you want to benchmark it.",
            file=sys.stderr,
        )
        sys.exit(1)

    benchmark(
        chardet_mod=cchardet if args.cchardet else chardet,
        verbose=args.verbose,
        num_iters=args.iterations,
    )


if __name__ == "__main__":
    main()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/jurge/chardet.git
git@gitee.com:jurge/chardet.git
jurge
chardet
chardet
master

搜索帮助

0d507c66 1850385 C8b1a773 1850385