master

分支 (7)

标签 (10)

管理

管理

master

feature/minimal_cli

feature/retrain_sbcs_models

jpz-improved_utf_detection

stable

feature/chunk_size

feature/uchardet-enhanced-upstream

4.0.0

3.0.4

3.0.3

3.0.2

3.0.1

3.0.0

2.3.0

2.2.1

2.2.0

1.1

chardet
/
bench.py

"""
Run chardet on a bunch of documents and see that we get the correct encodings.

:author: Dan Blanchard
:author: Ian Cordasco
"""


import argparse
import sys
import time
from collections import defaultdict
from os import listdir
from os.path import dirname, isdir, join, realpath, relpath, splitext

import chardet

try:
    import cchardet

    HAVE_CCHARDET = True
except:
    HAVE_CCHARDET = False


# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
#       retrain model.
MISSING_ENCODINGS = {
    "iso-8859-2",
    "iso-8859-6",
    "windows-1250",
    "windows-1254",
    "windows-1256",
}
EXPECTED_FAILURES = {
    "tests/iso-8859-7-greek/disabled.gr.xml",
    "tests/iso-8859-9-turkish/divxplanet.com.xml",
    "tests/iso-8859-9-turkish/subtitle.srt",
    "tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt",
}


def get_py_impl():
    """Return what kind of Python this is"""
    if hasattr(sys, "pypy_version_info"):
        pyimpl = "PyPy"
    elif sys.platform.startswith("java"):
        pyimpl = "Jython"
    elif sys.platform == "cli":
        pyimpl = "IronPython"
    else:
        pyimpl = "CPython"
    return pyimpl


def get_test_files():
    """Yields filenames to use for timing chardet.detect"""
    base_path = relpath(join(dirname(realpath(__file__)), "tests"))
    for encoding in listdir(base_path):
        path = join(base_path, encoding)
        # Skip files in tests directory
        if not isdir(path):
            continue
        # Remove language suffixes from encoding if pressent
        encoding = encoding.lower()
        for postfix in [
            "-arabic",
            "-bulgarian",
            "-cyrillic",
            "-greek",
            "-hebrew",
            "-hungarian",
            "-turkish",
        ]:
            if encoding.endswith(postfix):
                encoding = encoding.rpartition(postfix)[0]
                break
        # Skip directories for encodings we don't handle yet.
        if encoding in MISSING_ENCODINGS:
            continue
        # Test encoding detection for each file we have of encoding for
        for file_name in listdir(path):
            ext = splitext(file_name)[1].lower()
            if ext not in [".html", ".txt", ".xml", ".srt"]:
                continue
            full_path = join(path, file_name)
            if full_path in EXPECTED_FAILURES:
                continue
            yield full_path, encoding


def benchmark(chardet_mod=chardet, verbose=False, num_iters=10):
    print(
        f"Benchmarking {chardet_mod.__name__} {chardet_mod.__version__} "
        f"on {get_py_impl()} {sys.version}"
    )
    print("-" * 80)
    total_time = 0
    num_files = 0
    encoding_times = defaultdict(float)
    encoding_num_files = defaultdict(int)
    for full_path, encoding in get_test_files():
        num_files += 1
        with open(full_path, "rb") as f:
            input_bytes = f.read()
        start = time.time()
        for _ in range(num_iters):
            chardet_mod.detect(input_bytes)
        bench_time = time.time() - start
        if verbose:
            print(f"Average time for {full_path}: {bench_time / num_iters}s")
        else:
            print(".", end="")
            sys.stdout.flush()
        total_time += bench_time
        encoding_times[encoding] += bench_time
        encoding_num_files[encoding] += 1

    print("\nCalls per second for each encoding:")
    for encoding in sorted(encoding_times.keys()):
        calls_per_sec = (
            num_iters * encoding_num_files[encoding] / encoding_times[encoding]
        )
        print(f"{encoding}: {calls_per_sec}")
    calls_per_sec = num_iters * num_files / total_time
    print(f"\nTotal time: {total_time}s ({calls_per_sec} calls per second)")


def main():
    parser = argparse.ArgumentParser(
        description="Times how long it takes to process each file in test set "
        "multiple times.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-c",
        "--cchardet",
        action="store_true",
        help="Run benchmarks for cChardet instead of chardet, " "if it is installed.",
    )
    parser.add_argument(
        "-i",
        "--iterations",
        help="Number of times to process each file",
        type=int,
        default=10,
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Prints out the timing for each individual file.",
        action="store_true",
    )
    args = parser.parse_args()

    if args.cchardet and not HAVE_CCHARDET:
        print("You must pip install cchardet if you want to benchmark it.")
        sys.exit(1)

    benchmark(
        chardet_mod=cchardet if args.cchardet else chardet,
        verbose=args.verbose,
        num_iters=args.iterations,
    )


if __name__ == "__main__":
    main()