Skip to content

Languages

rigour.langs

Language code handling

This library helps to normalise the ISO 639 codes used to describe languages from two-letter codes to three letters, and vice versa.

import rigour.langs as languagecodes

assert 'eng' == languagecodes.iso_639_alpha3('en')
assert 'eng' == languagecodes.iso_639_alpha3('ENG ')
assert 'en' == languagecodes.iso_639_alpha2('ENG ')

Uses data from: https://iso639-3.sil.org/ See also: https://www.loc.gov/standards/iso639-2/php/code_list.php

LangStr

Bases: str

A type of string that includes language metadata. This is useful for handling multilingual content.

The class does not override any operators and functions, which means they will behave like a regular string.

Source code in rigour/langs/text.py
class LangStr(str):
    """A type of string that includes language metadata. This is useful for
    handling multilingual content.

    Equality, inequality, hashing and ``repr`` take the ``lang`` attribute
    into account; no other operators or methods are overridden, so they
    behave like those of a regular string.

    Args:
        content: The text of the string.
        lang: Optional language code; must be a member of ``ISO3_ALL`` when
            given.

    Raises:
        ValueError: If ``lang`` is given but is not in ``ISO3_ALL``.
    """

    __slots__ = ("lang",)

    def __new__(cls, content: str, lang: Optional[str] = None) -> "LangStr":
        # The text has to be passed to ``str.__new__``; ``lang`` is accepted
        # here only so the signature matches ``__init__``, which stores it.
        return str.__new__(cls, content)

    def __init__(self, content: str, lang: Optional[str] = None) -> None:
        if lang is not None and lang not in ISO3_ALL:
            raise ValueError(f"Invalid ISO 639-3 language code: {lang}")
        self.lang = lang

    def __repr__(self) -> str:
        """Return ``"text"@lang`` when a language is set, else the str repr."""
        if self.lang is not None:
            return f'"{super().__str__()}"@{self.lang}'
        return super().__repr__()

    def __hash__(self) -> int:
        # NOTE: the hash includes ``lang``, so it differs from the hash of a
        # plain ``str`` with the same text even though ``__eq__`` treats the
        # two as equal.
        return hash((super().__str__(), self.lang))

    def __eq__(self, value: object) -> bool:
        """Compare text and, when the other operand has one, its ``lang``.

        Operands without a ``lang`` attribute (e.g. plain ``str``) are
        compared on text alone.
        """
        text_equal = super().__eq__(value)
        # ``str.__eq__`` returns NotImplemented for non-string operands. It
        # must be propagated as-is rather than used in a boolean expression,
        # where it would count as truthy (and raises TypeError on newer
        # Python versions).
        if text_equal is NotImplemented or not text_equal:
            return text_equal
        try:
            return self.lang == value.lang  # type: ignore
        except AttributeError:
            # No language information available: the text already matched.
            return True

    def __ne__(self, value: object) -> bool:
        # ``str`` defines its own ``__ne__``, so it has to be overridden to
        # stay consistent with the language-aware ``__eq__`` above.
        outcome = self.__eq__(value)
        if outcome is NotImplemented:
            return outcome
        return not outcome

is_lang_better(candidate, baseline)

Decide if the candidate language code is 'better' than the baseline language code, according to the preferred languages list.

For example, `is_lang_better('eng', 'deu')` returns `True`, while `is_lang_better('fra', 'eng')` returns `False`.

Parameters:

Name Type Description Default
candidate str

The candidate language code.

required
baseline str

The baseline language code.

required

Returns:

Name Type Description
bool bool

True if the candidate is better than the baseline.

Source code in rigour/langs/__init__.py
def is_lang_better(candidate: str, baseline: str) -> bool:
    """Tell whether ``candidate`` ranks ahead of ``baseline`` in the
    preferred languages list.

     >>> is_lang_better('eng', 'deu')
     True
     >>> is_lang_better('fra', 'eng')
     False

    Args:
        candidate (str): The candidate language code.
        baseline (str): The baseline language code.

    Returns:
        bool: True if the candidate is better than the baseline.
    """
    ranks = []
    for lang_code in (candidate, baseline):
        try:
            position = PREFERRED_LANGS.index(lang_code)
        except ValueError:
            # Codes missing from the list all share one rank past its end,
            # so two unlisted codes never beat each other.
            position = len(PREFERRED_LANGS) + 1
        ranks.append(position)
    candidate_rank, baseline_rank = ranks
    return candidate_rank < baseline_rank

iso_639_alpha2(code)

Convert a language identifier to an ISO 639 Part 1 code, such as "en" or "de". For languages which do not have a two-letter identifier, or invalid language codes, None will be returned.

Source code in rigour/langs/__init__.py
def iso_639_alpha2(code: str) -> Optional[str]:
    """Convert a language identifier to an ISO 639 Part 1 code, such as "en"
    or "de".

    The identifier is first resolved with ``iso_639_alpha3`` and the result
    is looked up in ``ISO2_MAP``. ``None`` is returned when the identifier
    cannot be resolved or has no entry in that map.
    """
    three_letter = iso_639_alpha3(code)
    return None if three_letter is None else ISO2_MAP.get(three_letter)

iso_639_alpha3(code)

Convert a given language identifier into an ISO 639 Part 2 code, such as "eng" or "deu". This will accept language codes in the two- or three-letter format, and some language names. If the given string cannot be converted, None will be returned.

For example, `iso_639_alpha3('en')` returns `'eng'`.

Source code in rigour/langs/__init__.py
def iso_639_alpha3(code: str) -> Optional[str]:
    """Convert a given language identifier into an ISO 639 Part 2 code, such
    as "eng" or "deu". This will accept language codes in the two- or three-
    letter format, and some language names. If the given string cannot be
    converted, ``None`` will be returned.

    >>> iso_639_alpha3('en')
    'eng'
    """
    resolved = normalize_code(code)
    # Pass the normalised value through each lookup table in turn; a value
    # absent from a table is kept unchanged, and a ``None`` skips the lookup.
    for table in (ISO3_MAP, LANG_REWRITE):
        if resolved is not None:
            resolved = table.get(resolved, resolved)
    # Only values present in ISO3_ALL are accepted as a result.
    if resolved in ISO3_ALL:
        return resolved
    return None

list_to_alpha3(languages, synonyms=True)

Parse all the language codes in a given list into ISO 639 Part 2 codes and optionally expand them with synonyms (i.e. other names for the same language).

Source code in rigour/langs/__init__.py
def list_to_alpha3(languages: Iterable[str], synonyms: bool = True) -> Set[str]:
    """Parse all the language codes in a given list into ISO 639 Part 2 codes
    and optionally expand them with synonyms (i.e. other names for the same
    language).

    Entries that ``iso_639_alpha3`` cannot resolve are skipped.
    """
    resolved: Set[str] = set()
    for entry in ensure_list(languages):
        alpha3 = iso_639_alpha3(entry)
        if alpha3 is not None:
            resolved.add(alpha3)
            if synonyms:
                resolved.update(expand_synonyms(alpha3))
    return resolved