Skip to content

Commit

Permalink
Merge pull request #229 from openzim/i18n_class
Browse files Browse the repository at this point in the history
Move to real classes for i18n classes for proper typing in strict mode
  • Loading branch information
benoit74 authored Dec 20, 2024
2 parents f68d568 + 67844f4 commit b7b5e46
Show file tree
Hide file tree
Showing 2 changed files with 359 additions and 242 deletions.
357 changes: 173 additions & 184 deletions src/zimscraperlib/i18n.py
Original file line number Diff line number Diff line change
@@ -1,207 +1,196 @@
#!/usr/bin/env python3
# vim: ai ts=4 sts=4 et sw=4 nu

from __future__ import annotations

import re

import babel
import iso639
import iso639.exceptions
import iso639 # pyright: ignore[reportMissingTypeStubs]
import iso639.exceptions # pyright: ignore[reportMissingTypeStubs]

ISO_LEVELS = ["1", "2b", "2t", "3", "5"]


class NotFoundError(ValueError):
pass


class Lang(dict):

@property
def iso_639_1(self) -> str | None:
"""ISO-639-1 language code"""
return self["iso-639-1"]

@property
def iso_639_2b(self) -> str | None:
"""ISO-639-2b language code"""
return self["iso-639-2b"]

@property
def iso_639_2t(self) -> str | None:
"""ISO-639-2t language code"""
return self["iso-639-2t"]

@property
def iso_639_3(self) -> str | None:
"""ISO-639-3 language code"""
return self["iso-639-3"]

@property
def iso_639_5(self) -> str | None:
"""ISO-639-5 language code"""
return self["iso-639-5"]

@property
def english(self) -> str:
"""language name in English"""
return self["english"]

@property
def native(self) -> str:
"""language name in native language"""
return self["native"]

@property
def iso_types(self) -> list[str]:
"""list of supported iso types"""
return self["iso_types"]

@property
def query(self) -> str:
"""Query issued for these language details"""
return self["query"]

@property
def querytype(self) -> str:
"""Type of query issued to retrieve language details"""
return self["querytype"]


def get_iso_lang_data(lang: str) -> tuple[Lang, Lang | None]:
"""ISO-639-x languages details for lang. Raises NotFoundError
class NotFoundError(ValueError): ...


class Language:
"""Qualified ISO-639-3 language"""

def __init__(self, query: str):
"""Instantiate a valid ISO-639-3 Language from query
params: either an ISO-639 code or a locale or an english language name"""
self.iso_639_1: str | None = None
self.iso_639_2b: str | None = None
self.iso_639_2t: str | None = None
self.iso_639_3: str | None = None
self.iso_639_5: str | None = None
self.english: str | None = None
self.native: str | None = None
self.iso_types: list[str] = []
self.query: str = query
self.native_query: str | None = None
self.querytype: str | None = None

def get_adjusted_query(query: str) -> tuple[str, str, str]:
# possibily an iso-639 code
if query.isalpha() and (2 <= len(query) <= 3): # noqa: PLR2004
adjusted_query = query
native_query = query
query_type = "purecode"
# possibily a locale
elif all(x.isalpha() or x in ("-", "_") for x in query) and (
query.count("_") + query.count("-") == 1
):
adjusted_query = re.split("-|_", query)[0]
native_query = query.replace("-", "_")
query_type = "locale"
# possibily an ISO language name
else:
adjusted_query = query.title().replace("Languages", "languages")
native_query = query
query_type = "languagename"
return adjusted_query, native_query, query_type

adjusted_query, self.native_query, self.querytype = get_adjusted_query(query)

Returns a tuple (main_language, macro_language | None)"""

iso_types = []

try:
isolang = iso639.Lang(lang)
except (
iso639.exceptions.InvalidLanguageValue,
iso639.exceptions.DeprecatedLanguageValue,
) as exc:
raise NotFoundError("Not a valid iso language name/code") from exc

def replace_types(new_type: str) -> str:
# convert new iso_types from iso639-lang Pypi package to old iso_types from
# iso-639 package, since we were returning these values for a long time
if new_type == "pt1":
return "part1"
elif new_type == "pt2b":
return "part2b"
elif new_type == "pt2t":
return "part2t"
elif new_type == "pt3":
return "part3"
elif new_type == "pt5":
return "part5"
return new_type

for code_type in [f"pt{lang_}" for lang_ in ISO_LEVELS] + ["name"]:
# the `if` condition below is a bit hackish but it is the only way to know
# if the passed value is matching a code type or not with new python-i639
# library and we do not expect weird things to happen here
if str(getattr(isolang, code_type)).lower() == lang.lower():
iso_types.append(replace_types(code_type))

lang_data = Lang(
**{f"iso-639-{lang_}": getattr(isolang, f"pt{lang_}") for lang_ in ISO_LEVELS}
)
lang_data.update({"english": isolang.name, "iso_types": iso_types})

# first item in the returned tuple
macro = isolang.macro()
return (lang_data, get_iso_lang_data(macro.name)[0] if macro else None)


def find_language_names(query: str, lang_data: Lang | None = None) -> tuple[str, str]:
"""(native, english) language names for lang with help from lang_data
Falls back to English name if available or query if not"""
if lang_data is None:
lang_data = get_language_details(query, failsafe=True)
if not lang_data:
return query, query
try:
isolang = iso639.Lang(adjusted_query)
except (
iso639.exceptions.InvalidLanguageValue,
iso639.exceptions.DeprecatedLanguageValue,
) as exc:
raise NotFoundError("Not a valid iso language name/code") from exc

parts_keys_map = {
"iso_639_1": "pt1",
"iso_639_2b": "pt2b",
"iso_639_2t": "pt2t",
"iso_639_3": "pt3",
"iso_639_5": "pt5",
"english": "name",
}

try:
query_locale = babel.Locale.parse(query)
if native_display_name := query_locale.get_display_name():
if english_display_name := query_locale.get_display_name("en"):
return native_display_name, english_display_name
except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError):
pass

# ISO code lookup order matters (most qualified first)!
for iso_level in [f"iso-639-{lang_}" for lang_ in reversed(ISO_LEVELS)]:
self.iso_639_1 = isolang.pt1 or None
self.iso_639_2b = isolang.pt2b or None
self.iso_639_2t = isolang.pt2t or None
self.iso_639_3 = isolang.pt3 or None
self.iso_639_5 = isolang.pt5 or None
self.english = isolang.name or None
self.iso_types = [
part_level
for iso_level, part_level in [
(f"pt{level}", f"part{level}") for level in ISO_LEVELS
]
+ [("name", "name")]
if getattr(isolang, iso_level).lower() == adjusted_query.lower()
]

# update if language has a macro
if isolang.macro():
for iso_level in [f"iso_639_{level}" for level in ISO_LEVELS]:
if not getattr(self, iso_level):
setattr(
self,
iso_level,
# we'll get the pt attr for each iso_xxx
getattr(isolang.macro(), parts_keys_map[iso_level], None)
# we want None if value is empty
or None,
)

self.native, self.english = self._get_names_from(self.native_query)

def _get_names_from(self, query: str) -> tuple[str, str]:
"""logic to find language names from babel and fallback"""
try:
query_locale = babel.Locale.parse(lang_data.get(iso_level))
query_locale = babel.Locale.parse(query)
if native_display_name := query_locale.get_display_name():
if english_display_name := query_locale.get_display_name("en"):
return native_display_name, english_display_name
except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError):
pass
default = lang_data.get("english") or query
return default, default


def update_with_macro(lang_data: Lang, macro_data: Lang | None):
"""update empty keys from lang_data with ones of macro_data"""
if macro_data:
for key, value in macro_data.items():
if key in lang_data and not lang_data.get(key):
lang_data[key] = value
return lang_data


def get_language_details(
query: str, failsafe: bool | None = False # noqa: FBT002
) -> Lang | None:
"""language details dict from query.
When query fails, either raises NotFoundError or return None, based on failsafe
"""

if query.isalpha() and (2 <= len(query) <= 3): # noqa: PLR2004
# possibility of iso-639 code
adjusted_query = query
native_query = query
query_type = "purecode"
elif all(x.isalpha() or x in ("-", "_") for x in query) and (
query.count("_") + query.count("-") == 1
):
# possibility of locale
adjusted_query = re.split("-|_", query)[0]
native_query = query.replace("-", "_")
query_type = "locale"
else:
# possibility of iso language name
adjusted_query = query.title().replace("Languages", "languages")
native_query = query
query_type = "languagename"

try:
lang_data, macro_data = get_iso_lang_data(adjusted_query)
except NotFoundError as exc:
if failsafe:
return None
raise exc

iso_data = update_with_macro(lang_data, macro_data)
native_name, english_name = find_language_names(native_query, iso_data)
iso_data.update(
{
"english": english_name,
"native": native_name,
"querytype": query_type,
"query": query,
# ISO code lookup order matters (most qualified first)!
for iso_level in [f"iso_639_{level}" for level in reversed(ISO_LEVELS)]:
try:
query_locale = babel.Locale.parse(getattr(self, iso_level))
if native_display_name := query_locale.get_display_name():
if english_display_name := query_locale.get_display_name("en"):
return native_display_name, english_display_name
except (
babel.UnknownLocaleError,
TypeError,
ValueError,
AttributeError,
):
pass
default = self.english or query
return default, default

def todict(self) -> dict[str, str | None | list[str]]:
return {
key.replace("_", "-") if key.startswith("iso") else key: getattr(
self, key, None
)
for key in [
"iso_639_1",
"iso_639_2b",
"iso_639_2t",
"iso_639_3",
"iso_639_5",
"english",
"iso_types",
"native",
"querytype",
"query",
]
}
)
return iso_data

def __repr__(self) -> str:
data_repr = ", ".join(
f'{key.replace("-", "_")}="{value}"' for key, value in self.todict().items()
)
return f"{type(self).__name__}({data_repr})"

def __str__(self) -> str:
return f"{self.iso_639_3}: {self.english}"

def __eq__(self, value: object) -> bool:
return (
self.iso_639_1 == getattr(value, "iso_639_1", None)
and self.iso_639_2b == getattr(value, "iso_639_2b", None)
and self.iso_639_2t == getattr(value, "iso_639_2t", None)
and self.iso_639_3 == getattr(value, "iso_639_3", None)
and self.iso_639_5 == getattr(value, "iso_639_5", None)
and self.english == getattr(value, "english", None)
and self.native == getattr(value, "native", None)
)


def find_language_names(query: str) -> tuple[str, str]:
"""(native, english) language names for query"""
try:
lang = Language(query)
except NotFoundError:
return query, query
# should be qualified but "None" is as valid as anything if not
return str(lang.native), str(lang.english)


def get_language(lang_code: str) -> Language:
"""Language from lang_code"""
return Language(lang_code)


def get_language_or_none(lang_code: str) -> Language | None:
"""Language from lang_code or None if not found"""
try:
return get_language(lang_code)
except NotFoundError:
return None


def is_valid_iso_639_3(code: str) -> bool:
"""whether code is a valid ISO-639-3 code"""
return (get_language_details(code, failsafe=True) or {}).get("iso-639-3") == code
lang = get_language_or_none(code)
return lang is not None and lang.iso_639_3 == code
Loading

0 comments on commit b7b5e46

Please sign in to comment.