Skip to content

Languages

rigour.langs

Language code handling

This library helps to normalise the ISO 639 codes used to describe languages from two-letter codes to three letters, and vice versa.

import rigour.langs as languagecodes

assert 'eng' == languagecodes.iso_639_alpha3('en')
assert 'eng' == languagecodes.iso_639_alpha3('ENG ')
assert 'en' == languagecodes.iso_639_alpha2('ENG ')

Uses data from https://iso639-3.sil.org/ (see also: https://www.loc.gov/standards/iso639-2/php/code_list.php).

LangStr

Bases: str

A type of string that includes language metadata. This is useful for handling multilingual content.

The class only overrides repr(), hashing and the ==/!= comparisons so that they take the language into account; all other operators and functions behave like those of a regular string.

Source code in rigour/langs/text.py
class LangStr(str):
    """A type of string that includes language metadata.

    This is useful for handling multilingual content. Only ``repr()``,
    hashing and the ``==``/``!=`` comparisons are overridden so that they
    take the language into account; every other operator and method is
    inherited from ``str`` unchanged.

    Attributes:
        lang: The language code given at construction, or ``None`` when the
            text is not tagged with a language.
    """

    __slots__ = ("lang",)

    def __new__(cls, content: str, lang: Optional[str] = None) -> "LangStr":
        # ``str`` is immutable, so the text has to be set in ``__new__``; the
        # ``lang`` argument is accepted here only to mirror ``__init__``.
        instance = str.__new__(cls, content)
        return instance

    def __init__(self, content: str, lang: Optional[str] = None) -> None:
        """Validate and store the language tag.

        Raises:
            ValueError: If ``lang`` is given but is not a member of
                ``ISO3_ALL``.
        """
        if lang is not None and lang not in ISO3_ALL:
            raise ValueError(f"Invalid ISO 639-3 language code: {lang}")
        self.lang = lang

    def __repr__(self) -> str:
        """Return ``"text"@lang`` for tagged strings, else the plain repr."""
        if self.lang is not None:
            return f'"{super().__str__()}"@{self.lang}'
        return super().__repr__()

    def __hash__(self) -> int:
        # NOTE(review): the hash includes ``lang`` while ``__eq__`` treats a
        # plain ``str`` with the same text as equal, so a LangStr and an equal
        # plain string will generally land in different dict/set slots.
        return hash((super().__str__(), self.lang))

    def __eq__(self, value: object) -> bool:
        """Compare the text and, when ``value`` has one, the language tag.

        Objects without a ``lang`` attribute (e.g. plain strings) are compared
        on text alone. ``NotImplemented`` is returned for values that ``str``
        itself cannot compare against.
        """
        text_equal = super().__eq__(value)
        # Never evaluate ``NotImplemented`` in a boolean context: that is
        # deprecated since Python 3.9 and an error on newer interpreters.
        if text_equal is NotImplemented or not text_equal:
            return text_equal  # type: ignore
        try:
            return self.lang == value.lang  # type: ignore
        except AttributeError:
            # No language information available: fall back to text equality,
            # which is already known to hold at this point.
            return True

    def __ne__(self, value: object) -> bool:
        """Return the inverse of ``__eq__``, propagating ``NotImplemented``."""
        outcome = self.__eq__(value)
        if outcome is NotImplemented:
            # Let Python fall back to its default handling so that comparing
            # against an unrelated type (e.g. an int) yields ``True``.
            return outcome  # type: ignore
        return not outcome

iso_639_alpha2(code)

Convert a language identifier to an ISO 639 Part 1 code, such as "en" or "de". For languages which do not have a two-letter identifier, or invalid language codes, None will be returned.

Source code in rigour/langs/__init__.py
def iso_639_alpha2(code: str) -> Optional[str]:
    """Map a language identifier to its two-letter ISO 639 Part 1 code.

    Examples of such codes are "en" or "de". The identifier is first resolved
    to its three-letter form; ``None`` is returned when that resolution fails
    or when the language has no two-letter equivalent.
    """
    three_letter = iso_639_alpha3(code)
    return None if three_letter is None else ISO2_MAP.get(three_letter)

iso_639_alpha3(code)

Convert a given language identifier into an ISO 639 Part 2 code, such as "eng" or "deu". This will accept language codes in the two- or three-letter format, and some language names. If the given string cannot be converted, None will be returned.

>>> iso_639_alpha3('en')
'eng'

Source code in rigour/langs/__init__.py
def iso_639_alpha3(code: str) -> Optional[str]:
    """Resolve a language identifier to a three-letter ISO 639 code.

    The input is normalised, translated through ``ISO3_MAP`` and then through
    ``LANG_REWRITE``; the outcome is only returned if it is a member of
    ``ISO3_ALL``. In every other case the result is ``None``.

    >>> iso_639_alpha3('en')
    'eng'
    """
    candidate = normalize_code(code)
    if candidate is None:
        return None
    # Each lookup falls back to its input when there is no entry.
    candidate = ISO3_MAP.get(candidate, candidate)
    if candidate is None:
        return None
    candidate = LANG_REWRITE.get(candidate, candidate)
    return candidate if candidate in ISO3_ALL else None

list_to_alpha3(languages, synonyms=True)

Parse all the language codes in a given list into ISO 639 Part 2 codes and optionally expand them with synonyms (i.e. other names for the same language).

Source code in rigour/langs/__init__.py
def list_to_alpha3(languages: Iterable[str], synonyms: bool = True) -> Set[str]:
    """Convert a collection of language identifiers to three-letter codes.

    Identifiers that cannot be resolved are dropped. When ``synonyms`` is
    true, the result is additionally extended with whatever
    ``expand_synonyms`` yields for each resolved code.
    """
    resolved: Set[str] = set()
    for raw in ensure_list(languages):
        alpha3 = iso_639_alpha3(raw)
        if alpha3 is not None:
            resolved.add(alpha3)
            if synonyms:
                resolved.update(expand_synonyms(alpha3))
    return resolved