Language code handling
This library helps to normalise the ISO 639 codes used to describe languages from
two-letter codes to three letters, and vice versa.
import rigour.langs as languagecodes
assert 'eng' == languagecodes.iso_639_alpha3('en')
assert 'eng' == languagecodes.iso_639_alpha3('ENG ')
assert 'en' == languagecodes.iso_639_alpha2('ENG ')
Uses data from: https://iso639-3.sil.org/
See also: https://www.loc.gov/standards/iso639-2/php/code_list.php
LangStr
Bases: str
A type of string that includes language metadata. This is useful for handling multilingual content.
Apart from `repr()`, hashing and equality comparison (which take the language into account), the class does not override any operators or functions, which means it will otherwise behave like a regular string.
Source code in rigour/langs/text.py
class LangStr(str):
    """A type of string that includes language metadata.

    This is useful for handling multilingual content. Apart from ``repr()``,
    hashing and equality comparison (which take ``lang`` into account), the
    class does not override any ``str`` operators or methods, so instances
    otherwise behave like a regular string.
    """

    __slots__ = ("lang",)

    def __new__(cls, content: str, lang: Optional[str] = None) -> "LangStr":
        # ``str`` is immutable, so the text has to be supplied here; ``lang``
        # is accepted only to mirror ``__init__`` and is stored there.
        instance = str.__new__(cls, content)
        return instance

    def __init__(self, content: str, lang: Optional[str] = None) -> None:
        """Attach the language code to the string.

        Raises:
            ValueError: if ``lang`` is given but is not a member of
                ``ISO3_ALL``.
        """
        if lang is not None and lang not in ISO3_ALL:
            raise ValueError(f"Invalid ISO 639-3 language code: {lang}")
        self.lang = lang

    def __repr__(self) -> str:
        """Return ``"text"@lang`` when a language is set, else the plain
        ``str`` representation."""
        if self.lang is not None:
            return f'"{super().__str__()}"@{self.lang}'
        return super().__repr__()

    def __hash__(self) -> int:
        # NOTE(review): a LangStr compares equal to a plain ``str`` with the
        # same text (see ``__eq__``) but does not hash like it, so mixing the
        # two as dict/set keys will not de-duplicate them. Left unchanged
        # because callers may rely on the current behaviour - confirm.
        return hash((super().__str__(), self.lang))

    def __eq__(self, value: object) -> bool:
        """Compare text and, when ``value`` also carries a ``lang``
        attribute, the language.

        A string-like ``value`` without a ``lang`` attribute is compared on
        text alone. For values that ``str`` cannot compare with,
        ``NotImplemented`` is returned so Python can try the reflected
        operation and fall back to its default.
        """
        same_text = super().__eq__(value)
        # Never evaluate ``NotImplemented`` in a boolean context: it is
        # truthy (and a TypeError on newer Python versions), which would make
        # unrelated objects with a matching ``lang`` compare equal.
        if same_text is NotImplemented:
            return same_text
        if not same_text:
            return False
        try:
            return self.lang == value.lang  # type: ignore
        except AttributeError:
            # Plain ``str`` (or another string type without ``lang``).
            return True

    def __ne__(self, value: object) -> bool:
        """Inverse of ``__eq__``, propagating ``NotImplemented`` unchanged."""
        outcome = self.__eq__(value)
        if outcome is NotImplemented:
            return outcome
        return not outcome
iso_639_alpha2(code)
Convert a language identifier to an ISO 639 Part 1 code, such as "en"
or "de". For languages which do not have a two-letter identifier, or
invalid language codes, `None` will be returned.
Source code in rigour/langs/__init__.py
def iso_639_alpha2(code: str) -> Optional[str]:
    """Translate a language identifier into its ISO 639 Part 1 (two-letter)
    code, e.g. "en" or "de".

    The identifier is first resolved via ``iso_639_alpha3``; the result is
    then looked up in ``ISO2_MAP``. ``None`` is returned when the identifier
    cannot be resolved or has no entry in that map.
    """
    three_letter = iso_639_alpha3(code)
    return None if three_letter is None else ISO2_MAP.get(three_letter)
iso_639_alpha3(code)
Convert a given language identifier into an ISO 639 Part 2 code, such
as "eng" or "deu". This will accept language codes in the two- or three-
letter format, and some language names. If the given string cannot be
converted, `None` will be returned.
iso_639_alpha3('en')
'eng'
Source code in rigour/langs/__init__.py
def iso_639_alpha3(code: str) -> Optional[str]:
    """Resolve a language identifier to an ISO 639 Part 2 code such as
    "eng" or "deu".

    Two- and three-letter codes are accepted, as are some language names.
    ``None`` is returned when the input cannot be converted.

    >>> iso_639_alpha3('en')
    'eng'
    """
    resolved = normalize_code(code)
    # Apply the lookup tables in order; each one leaves unknown keys as-is
    # and is skipped once the value has become None.
    for table in (ISO3_MAP, LANG_REWRITE):
        if resolved is not None:
            resolved = table.get(resolved, resolved)
    # Only values present in ISO3_ALL are returned.
    return resolved if resolved in ISO3_ALL else None
list_to_alpha3(languages, synonyms=True)
Parse all the language codes in a given list into ISO 639 Part 2 codes
and optionally expand them with synonyms (i.e. other names for the same
language).
Source code in rigour/langs/__init__.py
def list_to_alpha3(languages: Iterable[str], synonyms: bool = True) -> Set[str]:
    """Convert every language code in ``languages`` to its ISO 639 Part 2
    form and collect the results in a set.

    Entries that ``iso_639_alpha3`` cannot resolve are dropped. When
    ``synonyms`` is true, each resolved code is additionally expanded with
    its synonyms (i.e. other names for the same language).
    """
    result: Set[str] = set()
    for entry in ensure_list(languages):
        alpha3 = iso_639_alpha3(entry)
        if alpha3 is not None:
            result.add(alpha3)
            if synonyms:
                result.update(expand_synonyms(alpha3))
    return result