Skip to content

Names

rigour.names

Name handling utilities for person and organisation names. This module contains a large (and growing) set of tools for handling names. In general, there are three types of names: people, organizations, and objects. Different normalization may be required for each of these types, including prefix removal for person names (e.g. "Mr." or "Ms.") and type normalization for organization names (e.g. "Incorporated" -> "Inc" or "Limited" -> "Ltd").

The Name class is meant to provide a structure for a name, including its original form, normalized form, metadata on the type of thing described by the name, and the language of the name. The NamePart class is used to represent individual parts of a name, such as the first name, middle name, and last name.

Name

Bases: object

A name of a thing, such as a person, organization or object. Each name consists of a sequence of parts, each of which has a form and a tag. The form is the text of the part, and the tag is a label indicating the type of part. For example, in the name "John Smith", "John" is a given name and "Smith" is a family name. The tag for "John" would be NamePartTag.GIVEN and the tag for "Smith" would be NamePartTag.FAMILY. The form for both parts would be the text of the part itself.

Source code in rigour/names/name.py
class Name(object):
    """A name of a thing, such as a person, organization or object. Each name consists of a
    sequence of parts, each of which has a form and a tag. The form is the text of the part, and the tag
    is a label indicating the type of part. For example, in the name "John Smith", "John" is a given name
    and "Smith" is a family name. The tag for "John" would be `NamePartTag.GIVEN` and the tag for "Smith"
    would be `NamePartTag.FAMILY`. The form for both parts would be the text of the part itself.
    """

    __slots__ = ["original", "form", "tag", "lang", "parts", "spans"]

    def __init__(
        self,
        original: str,
        form: Optional[str] = None,
        tag: NameTypeTag = NameTypeTag.UNK,
        lang: Optional[str] = None,
        parts: Optional[List[NamePart]] = None,
    ):
        self.original = original
        # Fall back to a pre-normalized version of the original string when no
        # explicit form is supplied.
        self.form = form or prenormalize_name(original)
        self.tag = tag
        self.lang = lang
        self.parts: List[NamePart] = parts or []
        if parts is None:
            # No parts supplied: tokenize the normalized form into parts.
            # (`token` avoids shadowing the `form` argument.)
            for i, token in enumerate(tokenize_name(self.form)):
                self.parts.append(NamePart(token, i))
        self.spans: List[Span] = []

    @property
    def comparable(self) -> str:
        """Return the ASCII representation of the name, if available."""
        return " ".join(part.comparable for part in self.parts)

    @property
    def norm_form(self) -> str:
        """Return the normalized form of the name by joining name parts."""
        return " ".join(part.form for part in self.parts)

    def tag_text(self, text: str, tag: NamePartTag, max_matches: int = 1) -> None:
        """Tags name parts from a text with a known tag type.

        For example, if the name is "John Smith", and we know that "John" is the given name,
        this method will tag that name part with NamePartTag.GIVEN.

        The tagger can skip tokens in the name. For example, if the name is
        "Karl-Theodor Maria Nikolaus zu Guttenberg", and `text` is "Karl-Theodor
        Nikolaus", both "Karl-Theodor" and "Nikolaus" will be tagged, while
        "Maria" will not be tagged.

        If `text` is not matched in full, the tagger will not tag any name parts. For example,
        if the name is "John Smith", and `text` is "John Ted", "John" will not be tagged.

        The tagger will tag up to `max_matches` occurrences of `text` in the name.
        For example, if the name is "John John Smith", and `text` is "John", both
        "John"s will be tagged if `max_matches` is >= 2.
        """
        tokens = tokenize_name(prenormalize_name(text))
        if len(tokens) == 0:
            return

        matches = 0
        matching: List[NamePart] = []
        for part in self.parts:
            # The token we need next is the one following those already matched.
            # len(matching) < len(tokens) always holds here: a complete match
            # below either returns or resets `matching`.
            next_token = tokens[len(matching)]
            if part.form == next_token:
                matching.append(part)
            # Only tag if we have matched the entire text
            if len(matching) == len(tokens):
                for matched_part in matching:
                    if matched_part.tag == NamePartTag.UNSET:
                        matched_part.tag = tag
                    elif not matched_part.tag.can_match(tag):
                        # if the part is already tagged, we check compatibility and
                        # otherwise mark it as an outcast from polite society
                        matched_part.tag = NamePartTag.AMBIGUOUS
                matches += 1
                if matches >= max_matches:
                    return
                # Reset the list of matching parts, i.e. start over matching from the
                # beginning of the tokenized text if we haven't reached `max_matches`.
                matching = []

    def apply_phrase(self, phrase: str, symbol: Symbol) -> None:
        """Apply a symbol to a phrase in the name.

        The phrase is expected to be a space-separated sequence of normalized
        tokens; parts are matched in order (intervening parts may be skipped)
        and every complete occurrence is recorded as a span.
        """
        matching: List[NamePart] = []
        tokens = phrase.split(" ")
        for part in self.parts:
            # Next token to look for is the one after those already matched.
            next_token = tokens[len(matching)]
            if part.form == next_token:
                matching.append(part)
            if len(matching) == len(tokens):
                self.spans.append(Span(matching, symbol))
                # Reset to find further occurrences of the phrase.
                matching = []

    def apply_part(self, part: NamePart, symbol: Symbol) -> None:
        """Apply a symbol to a part of the name by recording a one-part span."""
        self.spans.append(Span([part], symbol))

    @property
    def symbols(self) -> Set[Symbol]:
        """Return the set of symbols applied to the name."""
        symbols: Set[Symbol] = set()
        for span in self.spans:
            symbols.add(span.symbol)
        return symbols

    def contains(self, other: "Name") -> bool:
        """Check if this name contains another name.

        A name never contains itself, and untyped (UNK) names contain nothing.
        """
        if self == other or self.tag == NameTypeTag.UNK:
            return False
        if len(self.parts) < len(other.parts):
            return False

        if self.tag == NameTypeTag.PER:
            forms = [part.comparable for part in self.parts]
            other_forms = [part.comparable for part in other.parts]
            common_forms = list_intersection(forms, other_forms)

            # we want to make this support middle initials so that
            # "John Smith" can match "J. Smith"
            for ospan in other.spans:
                if ospan.symbol.category == Symbol.Category.INITIAL:
                    # Only single-letter parts count as initials here.
                    if len(ospan.parts[0].form) > 1:
                        continue
                    for span in self.spans:
                        if span.symbol == ospan.symbol:
                            common_forms.append(ospan.comparable)

            # If every part of the other name is represented in the common forms,
            # we consider it a match.
            if len(common_forms) == len(other_forms):
                return True

        return other.norm_form in self.norm_form

    def __eq__(self, other: Any) -> bool:
        try:
            return self.form == other.form  # type: ignore
        except AttributeError:
            return False

    def __hash__(self) -> int:
        return hash(self.form)

    def __str__(self) -> str:
        return self.original

    def __repr__(self) -> str:
        return "<Name(%r, %r, %r)>" % (self.original, self.form, self.tag.value)

    @classmethod
    def consolidate_names(cls, names: Iterable["Name"]) -> Set["Name"]:
        """Remove short names that are contained in longer names.

        This is useful when building a matcher to prevent a scenario where a short
        version of a name ("John Smith") is matched to a query ("John K Smith"), where a longer
        version would have disqualified the match ("John K Smith" != "John R Smith").
        """
        # Materialize the iterable: it is traversed once to build the set and
        # twice more by itertools.product, so a generator argument would
        # otherwise be exhausted after the first pass.
        names = list(names)
        # We call these super_names because they are (non-strict) supersets of names.
        super_names = set(names)

        for name, other in itertools.product(names, names):
            # Check if name is still in super_names, otherwise two equal names
            # will remove each other with none being left.
            if name in super_names and name.contains(other):
                # Use discard instead of remove here because other may already have been kicked out
                # by another name of which it was a subset.
                super_names.discard(other)

        return super_names

comparable property

Return the ASCII representation of the name, if available.

norm_form property

Return the normalized form of the name by joining name parts.

symbols property

Return the set of symbols applied to the name.

apply_part(part, symbol)

Apply a symbol to a part of the name.

Source code in rigour/names/name.py
def apply_part(self, part: NamePart, symbol: Symbol) -> None:
    """Apply a symbol to a single part of the name by recording a one-part span."""
    self.spans.append(Span([part], symbol))

apply_phrase(phrase, symbol)

Apply a symbol to a phrase in the name.

Source code in rigour/names/name.py
def apply_phrase(self, phrase: str, symbol: Symbol) -> None:
    """Apply a symbol to a phrase in the name.

    The phrase is expected to be a space-separated sequence of normalized
    tokens; parts are matched in order (intervening parts may be skipped)
    and every complete occurrence is recorded as a span.
    """
    matching: List[NamePart] = []
    tokens = phrase.split(" ")
    for part in self.parts:
        # Next token to look for is the one after those already matched;
        # len(matching) < len(tokens) always holds because a full match
        # resets `matching` below.
        next_token = tokens[len(matching)]
        if part.form == next_token:
            matching.append(part)
        if len(matching) == len(tokens):
            self.spans.append(Span(matching, symbol))
            # Reset to find further occurrences of the phrase.
            matching = []

consolidate_names(names) classmethod

Remove short names that are contained in longer names.

This is useful when building a matcher to prevent a scenario where a short version of a name ("John Smith") is matched to a query ("John K Smith"), where a longer version would have disqualified the match ("John K Smith" != "John R Smith").

Source code in rigour/names/name.py
@classmethod
def consolidate_names(cls, names: Iterable["Name"]) -> Set["Name"]:
    """Remove short names that are contained in longer names.

    This is useful when building a matcher to prevent a scenario where a short
    version of a name ("John Smith") is matched to a query ("John K Smith"), where a longer
    version would have disqualified the match ("John K Smith" != "John R Smith").
    """
    # Materialize the iterable: it is traversed once to build the set and
    # twice more by itertools.product, so a generator argument would
    # otherwise be exhausted after the first pass and nothing consolidated.
    names = list(names)
    # We call these super_names because they are (non-strict) supersets of names.
    super_names = set(names)

    for name, other in itertools.product(names, names):
        # Check if name is still in super_names, otherwise two equal names
        # will remove each other with none being left.
        if name in super_names and name.contains(other):
            # Use discard instead of remove here because other may already have been kicked out
            # by another name of which it was a subset.
            super_names.discard(other)

    return super_names

contains(other)

Check if this name contains another name.

Source code in rigour/names/name.py
def contains(self, other: "Name") -> bool:
    """Check if this name contains another name."""
    if self == other or self.tag == NameTypeTag.UNK:
        return False
    if len(self.parts) < len(other.parts):
        return False

    if self.tag == NameTypeTag.PER:
        forms = [part.comparable for part in self.parts]
        other_forms = [part.comparable for part in other.parts]
        common_forms = list_intersection(forms, other_forms)

        # we want to make this support middle initials so that
        # "John Smith" can match "J. Smith"
        for ospan in other.spans:
            if ospan.symbol.category == Symbol.Category.INITIAL:
                if len(ospan.parts[0].form) > 1:
                    continue
                for span in self.spans:
                    if span.symbol == ospan.symbol:
                        common_forms.append(ospan.comparable)

        # If every part of the other name is represented in the common forms,
        # we consider it a match.
        if len(common_forms) == len(other_forms):
            return True

    return other.norm_form in self.norm_form

tag_text(text, tag, max_matches=1)

Tags name parts from a text with a known tag type.

For example, if the name is "John Smith", and we know that "John" is the given name, this method will tag that name part with NamePartTag.GIVEN.

The tagger can skip tokens in the name. For example, if the name is "Karl-Theodor Maria Nikolaus zu Guttenberg", and text is "Karl-Theodor Nikolaus", both "Karl-Theodor" and "Nikolaus" will be tagged, while "Maria" will not be tagged.

If text is not matched in full, the tagger will not tag any name parts. For example, if the name is "John Smith", and text is "John Ted", "John" will not be tagged.

The tagger will tag up to max_matches occurrences of text in the name. For example, if the name is "John John Smith", and text is "John", both "John"s will be tagged if max_matches is >= 2.

Source code in rigour/names/name.py
def tag_text(self, text: str, tag: NamePartTag, max_matches: int = 1) -> None:
    """Tags name parts from a text with a known tag type.

    For example, if the name is "John Smith", and we know that "John" is the given name,
    this method will tag that name part with NamePartTag.GIVEN.

    The tagger can skip tokens in the name. For example, if the name is
    "Karl-Theodor Maria Nikolaus zu Guttenberg", and `text` is "Karl-Theodor
    Nikolaus", both "Karl-Theodor" and "Nikolaus" will be tagged, while
    "Maria" will not be tagged.

    If `text` is not matched in full, the tagger will not tag any name parts. For example,
    if the name is "John Smith", and `text` is "John Ted", "John" will not be tagged.

    The tagger will tag up to `max_matches` occurrences of `text` in the name.
    For example, if the name is "John John Smith", and `text` is "John", both
    "John"s will be tagged if `max_matches` is >= 2.
    """
    tokens = tokenize_name(prenormalize_name(text))
    if len(tokens) == 0:
        return

    matches = 0
    matching: List[NamePart] = []
    for part in self.parts:
        # The token we need next is the one following those already matched;
        # len(matching) < len(tokens) always holds here because a complete
        # match below either returns or resets `matching`.
        next_token = tokens[len(matching)]
        if part.form == next_token:
            matching.append(part)
        # Only tag if we have matched the entire text
        if len(matching) == len(tokens):
            for part in matching:
                if part.tag == NamePartTag.UNSET:
                    part.tag = tag
                elif not part.tag.can_match(tag):
                    # if the part is already tagged, we check compatibility and
                    # otherwise mark it as an outcast from polite society
                    part.tag = NamePartTag.AMBIGUOUS
            matches += 1
            if matches >= max_matches:
                return
            # Reset the list of matching parts, i.e. start over matching from the
            # beginning of the tokenized text if we haven't reached `max_matches`.
            matching = []

NamePart

Bases: object

A part of a name, such as a given name or family name. This object is used to compare and match names. It generates and caches representations of the name in various processing forms.

Source code in rigour/names/part.py
class NamePart(object):
    """A part of a name, such as a given name or family name. This object is used to compare
    and match names. It generates and caches representations of the name in various processing
    forms."""

    __slots__ = ["form", "index", "tag", "latinize", "numeric", "_ascii", "_hash"]

    def __init__(
        self,
        form: str,
        index: Optional[int] = None,
        tag: NamePartTag = NamePartTag.UNSET,
    ) -> None:
        self.form = form
        self.index = index

        self.tag = tag
        """Part tag, see NamePartTag."""

        self.latinize = can_latinize(form)
        """Whether this part can be latinized."""

        self.numeric = form.isnumeric()
        """Whether this part is numeric."""

        # Lazily computed ASCII transliteration; see the `ascii` property.
        self._ascii: Optional[str] = None
        # Identity is (index, form): parts at the same position with the same
        # text compare equal regardless of tag.
        self._hash = hash((self.index, self.form))

    @property
    def ascii(self) -> Optional[str]:
        """Return an alphanumeric-only ASCII transliteration of the form, or
        None if nothing remains. Numeric parts are rendered as their integer
        value. The result is cached after the first computation."""
        if self._ascii is None:
            if self.numeric:
                value = self.integer
                if value is not None:
                    self._ascii = str(value)
                    return self._ascii
            out = ascii_text(self.form)
            self._ascii = "".join(o for o in out if o.isalnum())
        return self._ascii if len(self._ascii) > 0 else None

    @property
    def integer(self) -> Optional[int]:
        """Return the integer value of a numeric part, or None if the part is
        not numeric or does not parse to a whole number."""
        if self.numeric:
            numeric = string_number(self.form)
            if numeric is not None and numeric.is_integer():
                return int(numeric)
        return None

    @property
    def comparable(self) -> str:
        """Return the preferred comparison form: the integer string for
        numeric parts, the ASCII transliteration for latinizable parts, and
        the raw form otherwise."""
        if self.numeric:
            # NOTE(review): if `integer` is None this returns the string
            # "None" — confirm string_number always parses isnumeric() forms.
            return str(self.integer)
        if not self.latinize:
            return self.form
        ascii = self.ascii
        if ascii is None:
            return self.form
        return ascii

    @property
    def metaphone(self) -> Optional[str]:
        """Return a metaphone (phonetic) key for latinizable, non-numeric
        parts with more than two ASCII characters, otherwise None."""
        if self.latinize and not self.numeric:
            text = self.ascii
            if text is not None and len(text) > 2:
                return metaphone(text)
        return None

    def can_match(self, other: "NamePart") -> bool:
        """Check if this part can match another part. This is based on the tags of the parts."""
        return self.tag.can_match(other.tag)

    def __eq__(self, other: Any) -> bool:
        try:
            return other._hash == self._hash  # type: ignore
        except AttributeError:
            return False

    def __hash__(self) -> int:
        return self._hash

    def __len__(self) -> int:
        return len(self.form)

    def __repr__(self) -> str:
        return "<NamePart(%r, %s, %r)>" % (self.form, self.index, self.tag.value)

    @classmethod
    def tag_sort(cls, parts: list["NamePart"]) -> list["NamePart"]:
        """Sort name parts into canonical order by their tag (per NAME_TAGS_ORDER)."""
        return sorted(parts, key=lambda np: NAME_TAGS_ORDER.index(np.tag))

latinize = can_latinize(form) instance-attribute

Whether this part can be latinized.

numeric = form.isnumeric() instance-attribute

Whether this part is numeric.

tag = tag instance-attribute

Part tag, see NamePartTag.

can_match(other)

Check if this part can match another part. This is based on the tags of the parts.

Source code in rigour/names/part.py
def can_match(self, other: "NamePart") -> bool:
    """Check if this part can match another part. This is based solely on the
    tags of the two parts (see NamePartTag.can_match)."""
    return self.tag.can_match(other.tag)

tag_sort(parts) classmethod

Sort name parts into canonical order by their tag.

Source code in rigour/names/part.py
@classmethod
def tag_sort(cls, parts: list["NamePart"]) -> list["NamePart"]:
    """Sort name parts into canonical order by their tag (per NAME_TAGS_ORDER)."""
    return sorted(parts, key=lambda np: NAME_TAGS_ORDER.index(np.tag))

NamePartTag

Bases: Enum

Within a name, identify name part types.

Source code in rigour/names/tag.py
class NamePartTag(Enum):
    """Within a name, identify name part types."""

    UNSET = "UNSET"  # No tag has been assigned yet
    AMBIGUOUS = "AMBIGUOUS"  # Conflicting tags were applied

    TITLE = "TITLE"
    GIVEN = "GIVEN"
    MIDDLE = "MIDDLE"
    FAMILY = "FAMILY"
    TRIBAL = "TRIBAL"
    PATRONYMIC = "PATRONYMIC"
    MATRONYMIC = "MATRONYMIC"
    HONORIFIC = "HONORIFIC"
    SUFFIX = "SUFFIX"
    NICK = "NICK"

    STOP = "STOP"  # Stopword
    NUM = "NUM"
    LEGAL = "LEGAL"  # Legal form of an organisation

    def can_match(self, other: "NamePartTag") -> bool:
        """Check if this tag can match the other tag.

        Wildcard tags match anything; identical tags always match. Otherwise,
        tags in the module-level GIVEN_NAME_TAGS and FAMILY_NAME_TAGS groups
        only match other tags within their own group.
        """
        if self in WILDCARDS or other in WILDCARDS:
            return True
        if self == other:
            return True
        if self in GIVEN_NAME_TAGS and other not in GIVEN_NAME_TAGS:
            return False
        if self in FAMILY_NAME_TAGS and other not in FAMILY_NAME_TAGS:
            return False
        return True

can_match(other)

Check if this tag can match the other tag.

Source code in rigour/names/tag.py
def can_match(self, other: "NamePartTag") -> bool:
    """Check if this tag can match the other tag.

    Wildcard tags match anything; identical tags always match. Otherwise,
    tags in the module-level GIVEN_NAME_TAGS and FAMILY_NAME_TAGS groups
    only match other tags within their own group.
    """
    if self in WILDCARDS or other in WILDCARDS:
        return True
    if self == other:
        return True
    if self in GIVEN_NAME_TAGS and other not in GIVEN_NAME_TAGS:
        return False
    if self in FAMILY_NAME_TAGS and other not in FAMILY_NAME_TAGS:
        return False
    return True

NameTypeTag

Bases: Enum

Metadata on what sort of object is described by a name

Source code in rigour/names/tag.py
class NameTypeTag(Enum):
    """Metadata on what sort of object is described by a name.

    Used as `Name.tag` to drive type-specific comparison (e.g. person-name
    containment logic in `Name.contains`)."""

    UNK = "UNK"  # Unknown
    ENT = "ENT"  # Entity
    PER = "PER"  # Person
    ORG = "ORG"  # Organization/Company
    OBJ = "OBJ"  # Object - Vessel, Security, etc.

Span

A span is a set of parts of a name that have been tagged with a symbol.

Source code in rigour/names/part.py
class Span:
    """A span is a set of parts of a name that have been tagged with a symbol."""

    __slots__ = ["parts", "symbol"]

    def __init__(self, parts: List[NamePart], symbol: Symbol) -> None:
        # Stored as a tuple so the span is immutable and hashable.
        self.parts = tuple(parts)
        self.symbol = symbol

    @property
    def comparable(self) -> str:
        """Return the comparison-suited string representation of the span."""
        return " ".join([part.comparable for part in self.parts])

    def __len__(self) -> int:
        """Return the combined character length of all parts in the span."""
        return sum(len(part) for part in self.parts)

    def __hash__(self) -> int:
        return hash((self.parts, self.symbol))

    def __eq__(self, other: Any) -> bool:
        # Compare by value rather than by hash: hash equality does not imply
        # equality (collisions), and hash(other) raises TypeError for
        # unhashable operands.
        if not isinstance(other, Span):
            return False
        return self.parts == other.parts and self.symbol == other.symbol

    def __repr__(self) -> str:
        return f"<Span({self.parts!r}, {self.symbol})>"

comparable property

Return the comparison-suited string representation of the span.

__len__()

Return the combined character length of all parts in the span.

Source code in rigour/names/part.py
def __len__(self) -> int:
    """Return the combined character length of all parts in the span."""
    return sum(len(part) for part in self.parts)

Symbol

A symbol is a semantic interpretation applied to one or more parts of a name. Symbols can represent various categories such as organization classes, initials, names, numeric, or phonetic transcriptions. Each symbol has a category and an identifier.

Source code in rigour/names/symbol.py
class Symbol:
    """A semantic interpretation attached to one or more parts of a name.

    Each symbol pairs a category — organization class, initial, known name,
    nickname, number, location, or phonetic key — with an arbitrary
    identifier. Symbols with the same category and id compare equal."""

    class Category(Enum):
        ORG_CLASS = "ORGCLS"
        SYMBOL = "SYMBOL"
        INITIAL = "INITIAL"
        NAME = "NAME"
        NICK = "NICK"
        NUMERIC = "NUM"
        LOCATION = "LOC"
        PHONETIC = "PHON"

    __slots__ = ["category", "id"]

    def __init__(self, category: Category, id: Any) -> None:
        """Create a symbol with a category and an id."""
        self.category = category
        self.id = id

    def __hash__(self) -> int:
        # Hash on the same (category, id) pair that defines equality.
        return hash((self.category, self.id))

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, Symbol):
            return (self.category, self.id) == (other.category, other.id)
        return False

    def __str__(self) -> str:
        return f"[{self.category.value}:{self.id}]"

    def __repr__(self) -> str:
        return f"<Symbol({self.category}, {self.id})>"

__init__(category, id)

Create a symbol with a category and an id.

Source code in rigour/names/symbol.py
def __init__(self, category: Category, id: Any) -> None:
    """Create a symbol with a category and an arbitrary identifier."""
    self.category = category
    self.id = id

align_person_name_order(left, right)

Aligns the name parts of a person name for two names based on their tags and their string similarity such that the most similar name parts are matched.

Parameters:

Name Type Description Default
left List[NamePart]

The name parts of the first name.

required
right List[NamePart]

The name parts of the second name.

required

Returns:

Type Description
Tuple[List[NamePart], List[NamePart]]

Tuple[List[NamePart], List[NamePart]]: A tuple containing the sorted name parts of both names.

Source code in rigour/names/alignment.py
def align_person_name_order(
    left: List[NamePart], right: List[NamePart]
) -> Tuple[List[NamePart], List[NamePart]]:
    """Aligns the name parts of a person name for two names based on their tags and their string
    similarity such that the most similar name parts are matched.

    Args:
        left (List[NamePart]): The name parts of the first name.
        right (List[NamePart]): The name parts of the second name.

    Returns:
        Tuple[List[NamePart], List[NamePart]]: A tuple containing the sorted name parts of both names.
    """
    if not len(left):
        # Nothing to align against; fall back to canonical tag order.
        return (left, NamePart.tag_sort(right))

    left_sorted: List[NamePart] = []
    right_sorted: List[NamePart] = []

    # Greedy best-first pairing: repeatedly pick the most similar
    # tag-compatible pair of unused parts, considering longer parts first.
    left_unused = sorted(left, key=len, reverse=True)
    right_unused = sorted(right, key=len, reverse=True)
    while len(left_unused) > 0 and len(right_unused) > 0:
        best_score = 0.0
        best_left_parts: Optional[List[NamePart]] = None
        best_right_parts: Optional[List[NamePart]] = None
        for qp, rp in product(left_unused, right_unused):
            if not NamePartTag.can_match(qp.tag, rp.tag):
                continue
            if qp.comparable == rp.comparable:
                # Exact match: no better candidate possible, stop searching.
                best_score = 1.0
                best_left_parts = [qp]
                best_right_parts = [rp]
                break
            # check the Levenshtein distance between the two parts
            score = _name_levenshtein([qp], [rp])
            if score > best_score:
                best_left_parts = [qp]
                best_right_parts = [rp]
                # When one part is longer, try grouping several short parts
                # from the other side against it (see _pack_short_parts —
                # presumably for concatenated name parts; TODO confirm).
                if len(qp.form) > len(rp.form):
                    best_right_parts = _pack_short_parts(qp, rp, right_unused)
                elif len(rp.form) > len(qp.form):
                    best_left_parts = _pack_short_parts(rp, qp, left_unused)
                # NOTE(review): re-scoring the packed grouping can produce a
                # lower score than the single-part score that won above —
                # confirm this is intended.
                best_score = _name_levenshtein(best_left_parts, best_right_parts)

        if best_score == 0.0:
            # no match found, break out of the loop
            break

        if best_left_parts is not None:
            left_sorted.extend(best_left_parts)
            for qp in best_left_parts:
                left_unused.remove(qp)
        if best_right_parts is not None:
            right_sorted.extend(best_right_parts)
            for rp in best_right_parts:
                right_unused.remove(rp)

    if not len(left_sorted):
        # Nothing could be paired; fall back to canonical tag order.
        return (NamePart.tag_sort(left), NamePart.tag_sort(right))

    # Unmatched leftovers are appended after the aligned pairs.
    left_sorted.extend(left_unused)
    right_sorted.extend(right_unused)
    return (left_sorted, right_sorted)

extract_org_types(name, normalizer=_normalize_compare, generic=False)

Match any organization type designation (e.g. LLC, Inc, GmbH) in the given entity name and return the extracted type.

This can be used as a very poor man's method to determine if a given string is a company name.

Parameters:

Name Type Description Default
name str

The text to be processed. It is assumed to be already normalized (see below).

required
normalizer Callable[[str | None], str | None]

A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.

_normalize_compare
generic bool

If True, return the generic form of the organization type (e.g. LLC, JSC) instead of the type-specific comparison form (GmbH, AB, NV).

False

Returns:

Type Description
List[Tuple[str, str]]

List[Tuple[str, str]]: List of tuples of the org type as matched, and the compare form of it.

Source code in rigour/names/org_types.py
def extract_org_types(
    name: str, normalizer: Normalizer = _normalize_compare, generic: bool = False
) -> List[Tuple[str, str]]:
    """Find every organization type designation (e.g. LLC, Inc, GmbH) in the given
    entity name and return the extracted types.

    This can be used as a very poor man's method to determine if a given string is a company name.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized (see below).
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.
        generic (bool): If True, return the generic form of the organization type (e.g. LLC, JSC) instead
            of the type-specific comparison form (GmbH, AB, NV).

    Returns:
        List[Tuple[str, str]]: Pairs of the org type as matched and its compare (or generic) form.
    """
    # Pick the replacer factory for the requested output form, then map each
    # extracted match to its mapped form (falling back to the match itself).
    make_replacer = _generic_replacer if generic else _compare_replacer
    replacer = make_replacer(normalizer=normalizer)
    mapping = replacer.mapping
    return [(found, mapping.get(found, found)) for found in replacer.extract(name)]

is_name(name)

Check if the given string is a name. The string is considered a name if it contains at least one character that is a letter (category 'L' in Unicode).

Source code in rigour/names/check.py
def is_name(name: str) -> bool:
    """Check if the given string is a name. The string is considered a name if it
    contains at least one character that is a letter (Unicode category 'L')."""
    return any(unicodedata.category(ch)[0] == "L" for ch in name)

is_stopword(form, *, normalizer=normalize_name, normalize=False)

Check if the given form is a stopword. The stopword list is normalized first.

Parameters:

Name Type Description Default
form str

The token to check, must already be normalized.

required
normalizer Normalizer

The normalizer to use for checking stopwords.

normalize_name
normalize bool

Whether to normalize the form before checking.

False

Returns:

Name Type Description
bool bool

True if the form is a stopword, False otherwise.

Source code in rigour/names/check.py
def is_stopword(
    form: str, *, normalizer: Normalizer = normalize_name, normalize: bool = False
) -> bool:
    """Check if the given form is a stopword. The stopword list is normalized first.

    Args:
        form (str): The token to check, must already be normalized unless
            `normalize` is True.
        normalizer (Normalizer): The normalizer to use for checking stopwords.
        normalize (bool): Whether to normalize the form before checking.

    Returns:
        bool: True if the form is a stopword, False otherwise.
    """
    candidate = normalizer(form) if normalize else form
    # A form that normalizes to nothing cannot be a stopword.
    if candidate is None:
        return False
    return candidate in _load_stopwords(normalizer)

load_person_names()

Load the person QID to name mappings from disk. This is a collection of aliases (in various alphabets) of person name parts mapped to a Wikidata QID representing that name part.

Returns:

Type Description
Generator[Tuple[str, List[str]], None, None]

Generator[Tuple[str, List[str]], None, None]: A generator yielding tuples of QID and list of names.

Source code in rigour/names/person.py
def load_person_names() -> Generator[Tuple[str, List[str]], None, None]:
    """Load the person QID to name mappings from disk. This is a collection
    of aliases (in various alphabets) of person name parts mapped to a
    Wikidata QID representing that name part.

    Returns:
        Generator[Tuple[str, List[str]], None, None]: A generator yielding tuples of QID and list of names.
    """
    with open(NAMES_DATA_PATH, "r", encoding="utf-8") as fh:
        for raw_line in fh:
            # Each line has the shape: "alias1, alias2, ... => QID"
            alias_part, qid = raw_line.strip().split(" => ")
            yield qid, alias_part.split(", ")

load_person_names_mapping(normalizer=noop_normalizer, min_mappings=1)

Load the person QID to name mappings from disk. This is a collection of aliases (in various alphabets) of person name parts mapped to a Wikidata QID representing that name part.

Parameters:

Name Type Description Default
normalizer Normalizer

A function to normalize names. Defaults to noop_normalizer.

noop_normalizer
min_mappings int

The minimum number of distinct normalized forms a QID must have for its names to be included.

1

Returns:

Type Description
Dict[str, Set[str]]

Dict[str, Set[str]]: A dictionary mapping normalized names to sets of QIDs.

Source code in rigour/names/person.py
def load_person_names_mapping(
    normalizer: Normalizer = noop_normalizer, min_mappings: int = 1
) -> Dict[str, Set[str]]:
    """Load the person QID to name mappings from disk. This is a collection
    of aliases (in various alphabets) of person name parts mapped to a
    Wikidata QID representing that name part.

    Args:
        normalizer (Normalizer, optional): A function to normalize names. Defaults to noop_normalizer.
        min_mappings (int, optional): The minimum number of distinct normalized forms a QID
            must produce for its names to be included in the mapping. Defaults to 1.

    Returns:
        Dict[str, Set[str]]: A dictionary mapping normalized names to sets of QIDs.
    """
    names: Dict[str, Set[str]] = {}
    for qid, aliases in load_person_names():
        forms: Set[str] = set()
        for alias in aliases:
            norm_alias = normalizer(alias)
            # Skip aliases that the normalizer rejects or reduces to nothing:
            if norm_alias is None or not len(norm_alias):
                continue
            forms.add(norm_alias)
        # Drop QIDs with too few distinct normalized forms:
        if len(forms) < min_mappings:
            continue
        for form in forms:
            names.setdefault(form, set()).add(qid)
    return names

normalize_name(name, sep=WS) cached

Normalize a name for tokenization and matching.

Source code in rigour/names/tokenize.py
@lru_cache(maxsize=MEMO_TINY)
def normalize_name(name: Optional[str], sep: str = WS) -> Optional[str]:
    """Normalize a name for tokenization and matching."""
    if name is None:
        return None
    tokens = tokenize_name(prenormalize_name(name))
    result = sep.join(tokens)
    # An empty normalization result is reported as None:
    return result if len(result) else None

pick_case(names)

Pick the best mix of lower- and uppercase characters from a set of names that are identical except for case. If the names are not identical, undefined things happen (not recommended).

Parameters:

Name Type Description Default
names List[str]

A list of identical names in different cases.

required

Returns:

Name Type Description
str str

The best name for display.

Source code in rigour/names/pick.py
def pick_case(names: List[str]) -> str:
    """Pick the best mix of lower- and uppercase characters from a set of names
    that are identical except for case. If the names are not identical, undefined
    things happen (not recommended).

    Args:
        names (List[str]): A list of identical names in different cases.

    Returns:
        str: The best name for display.
    """
    if not names:
        raise ValueError("Cannot pick a name from an empty list.")
    if len(names) == 1:
        return names[0]

    # Fast path: if the title-cased shortest variant already exists, use it.
    titled = min(names, key=len).title()
    if titled in names:
        return titled

    scores: Dict[str, float] = {}
    for candidate in names:
        at_word_start = True
        # Start the penalty at the length as a bias for shorter names (`ẞ` over `ss`).
        penalty = len(candidate)
        for char in candidate:
            if not char.isalpha():
                at_word_start = True
            elif at_word_start:
                # Lowercase at a word start is penalized more heavily:
                if not char.isupper():
                    penalty += 2
                at_word_start = False
            elif char.isupper():
                # Uppercase mid-word costs one point:
                penalty += 1
        scores[candidate] = penalty / len(candidate)

    if not scores:
        raise ValueError("Names could not be scored: %r" % names)

    # Lowest score wins; ties are broken by preferring the shorter name.
    best, _ = min(scores.items(), key=lambda item: (item[1], len(item[0])))
    return best

pick_name(names)

Pick the best name from a list of names. This is meant to pick a centroid name, with a bias towards names in a latin script.

Parameters:

Name Type Description Default
names List[str]

A list of names.

required

Returns:

Type Description
Optional[str]

Optional[str]: The best name for display.

Source code in rigour/names/pick.py
def pick_name(names: List[str]) -> Optional[str]:
    """Pick the best name from a list of names. This is meant to pick a centroid
    name, with a bias towards names in a latin script.

    Args:
        names (List[str]): A list of names.

    Returns:
        Optional[str]: The best name for display.
    """
    weights: Dict[str, float] = defaultdict(float)
    forms: Dict[str, List[str]] = defaultdict(list)
    latin_names: List[str] = []
    for name in sorted(names):
        folded = name.strip().casefold()
        if not len(folded):
            continue
        share = latin_share(name)
        if share > 0.85:
            latin_names.append(name)
        # Even totally non-Latin names have a base weight of 1:
        weight = 1 + share
        weights[folded] += weight
        forms[folded].append(name)
        forms[folded].append(name.title())

        ascii_form = ascii_text(folded)
        if len(ascii_form) > 2:
            weights[ascii_form] += weight
            forms[ascii_form].append(name)

    # A single clearly-Latin name wins outright:
    if len(latin_names) == 1:
        return latin_names[0]

    for candidate in levenshtein_pick(list(weights.keys()), weights):
        for surface in levenshtein_pick(forms.get(candidate, []), {}):
            if surface in names:
                return surface
    return None

prenormalize_name(name)

Prepare a name for tokenization and matching.

Source code in rigour/names/tokenize.py
def prenormalize_name(name: Optional[str]) -> str:
    """Prepare a name for tokenization and matching."""
    # Unicode NFC normalization is deliberately left disabled here:
    # name = unicodedata.normalize("NFC", name)
    return "" if name is None else name.casefold()

reduce_names(names)

Select a reduced set of names from a list of names. This is used to prepare the set of names linked to a person, organization, or other entity for publication.

Parameters:

Name Type Description Default
names List[str]

A list of names.

required

Returns:

Type Description
List[str]

List[str]: The reduced list of names.

Source code in rigour/names/pick.py
def reduce_names(names: List[str]) -> List[str]:
    """Select a reduced set of names from a list of names. This is used to
    prepare the set of names linked to a person, organization, or other entity
    for publication.

    Args:
        names (List[str]): A list of names.

    Returns:
        List[str]: The reduced list of names.
    """
    if len(names) < 2:
        return [name for name in names if is_name(name)]
    grouped: Dict[str, List[str]] = defaultdict(list)
    for name in names:
        # Filter names that are not valid (e.g. empty or do not contain any letters)
        if not is_name(name):
            log.warning("Invalid name found: %r", name)
            continue
        grouped[name.casefold()].append(name)
    output: List[str] = []
    for variants in grouped.values():
        try:
            output.append(pick_case(variants))
        except ValueError:
            # Keep all variants if a single best case cannot be chosen:
            log.exception("Could not pick name from group: %r", variants)
            output.extend(variants)
    return output

remove_obj_prefixes(name)

Remove prefixes like The, MV, etc.

Source code in rigour/names/prefix.py
def remove_obj_prefixes(name: str) -> str:
    """Remove prefixes like The, MV, etc."""
    pattern = re_prefixes(OBJ_NAME_PREFIXES)
    return pattern.sub("", name)

remove_org_prefixes(name)

Remove organisation name prefixes (the entries in ORG_NAME_PREFIXES).

Source code in rigour/names/prefix.py
def remove_org_prefixes(name: str) -> str:
    """Remove organisation name prefixes (the entries in ORG_NAME_PREFIXES).

    NOTE(review): the previous docstring ("Mr., Mrs.") was apparently copy-pasted
    from remove_person_prefixes; this function strips organisation prefixes.
    """
    return re_prefixes(ORG_NAME_PREFIXES).sub("", name)

remove_org_types(name, replacement='', normalizer=_normalize_compare)

Match any organization type designation (e.g. LLC, Inc, GmbH) in the given entity name and replace it with the given fixed string (empty by default, which signals removal).

Parameters:

Name Type Description Default
name str

The text to be processed. It is assumed to be already normalized (see below).

required
normalizer Callable[[str | None], str | None]

A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.

_normalize_compare

Returns:

Name Type Description
str str

The text with organization types replaced/removed.

Source code in rigour/names/org_types.py
def remove_org_types(
    name: str, replacement: str = "", normalizer: Normalizer = _normalize_compare
) -> str:
    """Match any organization type designation (e.g. LLC, Inc, GmbH) in the given entity name and
    replace it with the given fixed string (empty by default, which signals removal).

    Args:
        name (str): The text to be processed. It is assumed to be already normalized (see below).
        replacement (str): The fixed string substituted for each matched organization type.
            Defaults to the empty string, i.e. removal.
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.

    Returns:
        str: The text with organization types replaced/removed.
    """
    replacer = _compare_replacer(normalizer=normalizer)
    return replacer.remove(name, replacement=replacement)

remove_person_prefixes(name)

Remove prefixes like Mr., Mrs., etc.

Source code in rigour/names/prefix.py
def remove_person_prefixes(name: str) -> str:
    """Remove prefixes like Mr., Mrs., etc."""
    pattern = re_prefixes(PERSON_NAME_PREFIXES)
    return pattern.sub("", name)

replace_org_types_compare(name, normalizer=_normalize_compare, generic=False) cached

Replace any organization type indicated in the given entity name (often as a prefix or suffix) with a heavily normalized form label. This will re-write country-specific entity types (eg. GmbH) into a simplified spelling suitable for comparison using string distance. The resulting text is meant to be used in comparison processes, but no longer fit for presentation to a user.

Parameters:

Name Type Description Default
name str

The text to be processed. It is assumed to be already normalized (see below).

required
normalizer Callable[[str | None], str | None]

A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.

_normalize_compare
generic bool

If True, return the generic form of the organization type (e.g. LLC, JSC) instead of the type-specific comparison form (GmbH, AB, NV).

False

Returns:

Type Description
str

str: The text with organization types replaced.

Source code in rigour/names/org_types.py
@lru_cache(maxsize=1024)
def replace_org_types_compare(
    name: str, normalizer: Normalizer = _normalize_compare, generic: bool = False
) -> str:
    """Replace any organization type indicated in the given entity name (often as a prefix or suffix)
    with a heavily normalized form label. This will re-write country-specific entity types (eg. GmbH)
    into a simplified spelling suitable for comparison using string distance. The resulting text is
    meant to be used in comparison processes, but no longer fit for presentation to a user.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized (see below).
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.
        generic (bool): If True, return the generic form of the organization type (e.g. LLC, JSC) instead
            of the type-specific comparison form (GmbH, AB, NV).

    Returns:
        str: The text with organization types replaced; the input is returned unchanged if the
            replacer yields no result.
    """
    _func = _generic_replacer if generic else _compare_replacer
    replacer = _func(normalizer=normalizer)
    # Fall back to the original name if the replacer produces nothing:
    return replacer(name) or name

replace_org_types_display(name, normalizer=normalize_display)

Replace organization types in the text with their shortened form. This will perform a display-safe (light) form of normalization, useful for shortening spelt-out legal forms into common abbreviations (eg. Siemens Aktiengesellschaft -> Siemens AG).

If the replacement yields no result, the original text is returned as-is.

Parameters:

Name Type Description Default
name str

The text to be processed. It is assumed to be already normalized (see below).

required
normalizer Callable[[str | None], str | None]

A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.

normalize_display

Returns:

Type Description
str

str: The text with organization types replaced.

Source code in rigour/names/org_types.py
def replace_org_types_display(
    name: str, normalizer: Normalizer = normalize_display
) -> str:
    """Replace organization types in the text with their shortened form. This will perform
    a display-safe (light) form of normalization, useful for shortening spelt-out legal forms
    into common abbreviations (eg. Siemens Aktiengesellschaft -> Siemens AG).

    If the replacement yields no result (None), the original text is returned as-is.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized (see below).
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.

    Returns:
        str: The text with organization types replaced.
    """
    # Remember all-caps inputs so the replaced text keeps the same casing:
    is_uppercase = name.isupper()
    replacer = _display_replacer(normalizer=normalizer)
    out_text = replacer(name)
    if out_text is None:
        return name
    if is_uppercase:
        out_text = out_text.upper()
    return out_text

tag_org_name(name, normalizer)

Tag the name with the organization type and symbol tags.

Source code in rigour/names/tagging.py
def tag_org_name(name: Name, normalizer: Normalizer) -> Name:
    """Tag the name with the organization type and symbol tags."""
    org_tagger = _get_org_tagger(normalizer)
    # Apply every phrase/symbol pair the tagger finds in the normalized form:
    for tagged_phrase, tagged_symbol in org_tagger(name.norm_form):
        name.apply_phrase(tagged_phrase, tagged_symbol)
    return _infer_part_tags(name)

tag_person_name(name, normalizer, any_initials=False)

Tag a person's name with the person name part and other symbol tags.

Source code in rigour/names/tagging.py
def tag_person_name(
    name: Name, normalizer: Normalizer, any_initials: bool = False
) -> Name:
    """Tag a person's name with the person name part and other symbol tags."""
    # Tag given name abbreviations. This is meant to handle a case where the person's
    # first or middle name is an abbreviation, e.g. "J. Smith" or "John Q. Smith".
    for piece in name.parts:
        if not piece.latinize:
            continue
        initial = Symbol(Symbol.Category.INITIAL, piece.comparable[0])
        # Either any single-character part qualifies (when any_initials is set),
        # or the part must carry one of the initial-bearing tags:
        if (any_initials and len(piece.form) == 1) or piece.tag in INTITIAL_TAGS:
            name.apply_part(piece, initial)

    # Tag the name with person symbols:
    person_tagger = _get_person_tagger(normalizer)
    for tagged_phrase, tagged_symbol in person_tagger(name.norm_form):
        name.apply_phrase(tagged_phrase, tagged_symbol)

    return _infer_part_tags(name)

tokenize_name(text, token_min_length=1)

Split a person or entity's name into name parts.

Source code in rigour/names/tokenize.py
def tokenize_name(text: str, token_min_length: int = 1) -> List[str]:
    """Split a person or entity's name into name parts."""
    # FIXME: Do we want to support CJK scripts at some stage?
    tokens: List[str] = []
    token: List[str] = []
    # TODO: Do we want to do some form of unicode normalization here?
    # text = unicodedata.normalize("NFC", text)
    for char in text:
        if char in SKIP_CHARACTERS:
            continue
        cat = unicodedata.category(char)
        chr = TOKEN_SEP_CATEGORIES.get(cat, char)
        if chr is None:
            continue
        if chr == WS:
            if len(token) >= token_min_length:
                tokens.append("".join(token))
            token.clear()
            continue
        token.append(chr)

    if len(token) >= token_min_length:
        tokens.append("".join(token))
    return tokens