Browse Source

Fixing some performance bottlenecks (#183)

* small performance correction
pull/188/head
deedy5 2 months ago committed by GitHub
parent
commit
d50cd847ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 17
      charset_normalizer/cd.py
  2. 10
      charset_normalizer/md.py
  3. 10
      charset_normalizer/utils.py

17
charset_normalizer/cd.py

@ -175,9 +175,10 @@ def characters_popularity_compare(
raise ValueError("{} not available".format(language))
character_approved_count = 0 # type: int
FREQUENCIES_language_set = set(FREQUENCIES[language])
for character in ordered_characters:
if character not in FREQUENCIES[language]:
if character not in FREQUENCIES_language_set:
continue
characters_before_source = FREQUENCIES[language][
@ -186,7 +187,6 @@ def characters_popularity_compare(
characters_after_source = FREQUENCIES[language][
FREQUENCIES[language].index(character) :
] # type: List[str]
characters_before = ordered_characters[
0 : ordered_characters.index(character)
] # type: List[str]
@ -194,15 +194,12 @@ def characters_popularity_compare(
ordered_characters.index(character) :
] # type: List[str]
before_match_count = [
e in characters_before for e in characters_before_source
].count(
True
before_match_count = len(
set(characters_before) & set(characters_before_source)
) # type: int
after_match_count = [
e in characters_after for e in characters_after_source
].count(
True
after_match_count = len(
set(characters_after) & set(characters_after_source)
) # type: int
if len(characters_before_source) == 0 and before_match_count <= 4:

10
charset_normalizer/md.py

@ -16,6 +16,7 @@ from .utils import (
is_separator,
is_symbol,
is_thai,
is_unprintable,
remove_accent,
unicode_range,
)
@ -139,11 +140,7 @@ class UnprintablePlugin(MessDetectorPlugin):
return True
def feed(self, character: str) -> None:
if (
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
and character != "\x1A" # Why? Its the ASCII substitute character.
):
if is_unprintable(character):
self._unprintable_count += 1
self._character_count += 1
@ -269,7 +266,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
def feed(self, character: str) -> None:
if character.isalpha():
self._buffer = "".join([self._buffer, character])
self._buffer += character
if is_accentuated(character):
self._buffer_accent_count += 1
if (
@ -446,6 +443,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
return self._successive_upper_lower_count_final / self._character_count
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:

10
charset_normalizer/utils.py

@ -73,6 +73,7 @@ def is_latin(character: str) -> bool:
return "LATIN" in description
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_ascii(character: str) -> bool:
try:
character.encode("ascii")
@ -197,6 +198,15 @@ def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    """Return True when *character* is neither whitespace nor printable.

    Whitespace (including \\n \\t \\r \\v) and printable characters are
    never flagged; the ASCII substitute character (0x1A) is likewise
    tolerated even though Python considers it non-printable.
    """
    if character.isspace():
        # Whitespace is acceptable output, never "unprintable" here.
        return False
    if character.isprintable():
        return False
    # 0x1A is the ASCII substitute character — deliberately allowed.
    return character != "\x1A"
def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
"""
Extract using ASCII-only decoder any specified encoding in the first n-bytes.

Loading…
Cancel
Save