Skip to content

Names

rigour.names

Name handling utilities for person and organisation names. This module contains a large (and growing) set of tools for handling names. In general, there are three types of names: people, organizations, and objects. Different normalization may be required for each of these types, including prefix removal for person names (e.g. "Mr." or "Ms.") and type normalization for organization names (e.g. "Incorporated" -> "Inc" or "Limited" -> "Ltd").

The Name class is meant to provide a structure for a name, including its original form, normalized form, metadata on the type of thing described by the name, and the language of the name. The NamePart class is used to represent individual parts of a name, such as the first name, middle name, and last name.

Name

Bases: object

A name of a thing, such as a person, organization or object. Each name consists of a sequence of parts, each of which has a form and a tag. The form is the text of the part, and the tag is a label indicating the type of part. For example, in the name "John Smith", "John" is a given name and "Smith" is a family name. The tag for "John" would be NamePartTag.GIVEN and the tag for "Smith" would be NamePartTag.FAMILY. The form for both parts would be the text of the part itself.

Source code in rigour/names/name.py
class Name(object):
    """A name of a thing, such as a person, organization or object. Each name consists of a
    sequence of parts, each of which has a form and a tag. The form is the text of the part, and the tag
    is a label indicating the type of part. For example, in the name "John Smith", "John" is a given name
    and "Smith" is a family name. The tag for "John" would be `NamePartTag.GIVEN` and the tag for "Smith"
    would be `NamePartTag.FAMILY`. The form for both parts would be the text of the part itself.
    """

    __slots__ = ["original", "form", "tag", "lang", "_parts", "spans"]

    def __init__(
        self,
        original: str,
        form: Optional[str] = None,
        tag: NameTypeTag = NameTypeTag.UNK,
        lang: Optional[str] = None,
        parts: Optional[List[NamePart]] = None,
    ):
        """Create a name.

        Args:
            original: The name as it was supplied.
            form: The pre-normalized form; derived from `original` via
                `prenormalize_name` when not given (or empty).
            tag: The type of thing the name describes.
            lang: Language of the name, if known.
            parts: Pre-built name parts; when omitted they are produced lazily
                by tokenizing `form` (see `parts`).
        """
        self.original = original
        self.form = form or prenormalize_name(original)
        self.tag = tag
        self.lang = lang
        self._parts = parts
        self.spans: List[Span] = []

    @property
    def parts(self) -> List[NamePart]:
        """Return the parts of the name, tokenizing `form` on first access."""
        if self._parts is None:
            self._parts = [
                NamePart(form, i) for i, form in enumerate(tokenize_name(self.form))
            ]
        return self._parts

    @property
    def comparable(self) -> str:
        """Return the `comparable` forms of all parts, joined by spaces."""
        return " ".join(part.comparable for part in self.parts)

    @property
    def norm_form(self) -> str:
        """Return the normalized form of the name by joining name parts."""
        return " ".join([part.form for part in self.parts])

    def tag_text(self, text: str, tag: NamePartTag, max_matches: int = 1) -> None:
        """Tag the parts of this name that spell out `text` with `tag`.

        The text is pre-normalized and tokenized, then searched for as a
        contiguous run of parts. Only parts that are still untagged
        (`NamePartTag.ANY`) or already carry `tag` can take part in a match.

        Args:
            text: The phrase to look for, e.g. a known first name.
            tag: The tag to assign to every part of a matching run.
            max_matches: Stop after this many (non-overlapping) runs were tagged.
        """
        tokens = tokenize_name(prenormalize_name(text))
        if not tokens:
            # Nothing to search for; indexing into `tokens` would fail.
            return
        parts = self.parts
        width = len(tokens)
        allowed = (tag, NamePartTag.ANY)
        matches = 0
        start = 0
        while start + width <= len(parts):
            window = parts[start : start + width]
            if all(p.form == t and p.tag in allowed for p, t in zip(window, tokens)):
                for matched in window:
                    matched.tag = tag
                matches += 1
                if matches >= max_matches:
                    return
                # Matches must not overlap: continue after this run.
                start += width
            else:
                start += 1

    def apply_phrase(self, phrase: str, symbol: Symbol) -> None:
        """Apply a symbol to every contiguous, non-overlapping occurrence of a
        phrase in the name. The phrase is split on single spaces and compared
        to the forms of the name parts."""
        tokens = phrase.split(" ")
        parts = self.parts
        forms = [part.form for part in parts]
        width = len(tokens)
        start = 0
        while start + width <= len(parts):
            if forms[start : start + width] == tokens:
                self.spans.append(Span(parts[start : start + width], symbol))
                start += width
            else:
                start += 1

    def apply_part(self, part: NamePart, symbol: Symbol) -> None:
        """Apply a symbol to a part of the name."""
        self.spans.append(Span([part], symbol))

    @property
    def symbols(self) -> Set[Symbol]:
        """Return the set of symbols applied to the name."""
        return {span.symbol for span in self.spans}

    def contains(self, other: "Name") -> bool:
        """Check if this name contains another name.

        Equal names and names of unknown type never contain anything. For
        person names, the comparable forms of the parts are intersected, with
        single-letter initials of `other` counted when this name carries the
        same initial symbol. Otherwise (or if that check fails) the result is
        whether `other.norm_form` is a substring of this name's `norm_form`.
        """
        if self == other or self.tag == NameTypeTag.UNK:
            return False
        if len(self.parts) < len(other.parts):
            return False

        if self.tag == NameTypeTag.PER:
            forms = [part.comparable for part in self.parts]
            other_forms = [part.comparable for part in other.parts]
            common_forms = list_intersection(forms, other_forms)

            # we want to make this support middle initials so that
            # "John Smith" can match "J. Smith"
            for ospan in other.spans:
                if ospan.symbol.category == Symbol.Category.INITIAL:
                    if len(ospan.parts[0].form) > 1:
                        continue
                    for span in self.spans:
                        if span.symbol == ospan.symbol:
                            common_forms.append(ospan.comparable)

            # If every part of the other name is represented in the common forms,
            # we consider it a match.
            if len(common_forms) == len(other_forms):
                return True

        return other.norm_form in self.norm_form

    def symbol_map(self) -> Dict[Symbol, List[Span]]:
        """Return a mapping of each symbol to the spans it was applied to."""
        symbol_map: Dict[Symbol, List[Span]] = {}
        for span in self.spans:
            symbol_map.setdefault(span.symbol, []).append(span)
        return symbol_map

    def __eq__(self, other: Any) -> bool:
        # Names are equal when their pre-normalized forms are equal.
        try:
            return self.form == other.form  # type: ignore
        except AttributeError:
            return False

    def __hash__(self) -> int:
        return hash(self.form)

    def __str__(self) -> str:
        return self.original

    def __repr__(self) -> str:
        return "<Name(%r, %r, %r)>" % (self.original, self.form, self.tag.value)

comparable property

Return the ASCII representation of the name, if available.

norm_form property

Return the normalized form of the name by joining name parts.

symbols property

Return the set of symbols applied to the name.

apply_part(part, symbol)

Apply a symbol to a part of the name.

Source code in rigour/names/name.py
def apply_part(self, part: NamePart, symbol: Symbol) -> None:
    """Record `symbol` on a single name part by adding a one-part span."""
    single_part_span = Span([part], symbol)
    self.spans.append(single_part_span)

apply_phrase(phrase, symbol)

Apply a symbol to a phrase in the name.

Source code in rigour/names/name.py
def apply_phrase(self, phrase: str, symbol: Symbol) -> None:
    """Apply a symbol to every contiguous, non-overlapping occurrence of a
    phrase in the name.

    The phrase is split on single spaces and compared to the forms of the
    name parts. A span is only recorded when all tokens appear directly next
    to each other, in order; a partial match that is interrupted by a
    different part is discarded.
    """
    tokens = phrase.split(" ")
    parts = self.parts
    forms = [part.form for part in parts]
    width = len(tokens)
    start = 0
    while start + width <= len(parts):
        if forms[start : start + width] == tokens:
            self.spans.append(Span(parts[start : start + width], symbol))
            # Occurrences must not overlap: continue after this one.
            start += width
        else:
            start += 1

contains(other)

Check if this name contains another name.

Source code in rigour/names/name.py
def contains(self, other: "Name") -> bool:
    """Tell whether `other` is contained in this name.

    Equal names and names of unknown type never contain another name, and a
    name cannot contain one with more parts. Person names are first compared
    part-by-part (with support for initials); everything else falls back to
    a substring test on the normalized forms.
    """
    if self == other:
        return False
    if self.tag == NameTypeTag.UNK:
        return False
    own_parts, their_parts = self.parts, other.parts
    if len(their_parts) > len(own_parts):
        return False

    if self.tag == NameTypeTag.PER:
        own_forms = [p.comparable for p in own_parts]
        wanted = [p.comparable for p in their_parts]
        shared = list_intersection(own_forms, wanted)

        # Initials: let "John Smith" contain "J. Smith" when both names
        # carry the same initial symbol.
        for theirs in other.spans:
            if theirs.symbol.category != Symbol.Category.INITIAL:
                continue
            if len(theirs.parts[0].form) > 1:
                # Only single-character parts count as initials here.
                continue
            for mine in self.spans:
                if mine.symbol == theirs.symbol:
                    shared.append(theirs.comparable)

        # Every part of the other name is accounted for: it is contained.
        if len(shared) == len(wanted):
            return True

    return other.norm_form in self.norm_form

symbol_map()

Return a mapping of symbols to the spans they have been applied to.

Source code in rigour/names/name.py
def symbol_map(self) -> Dict[Symbol, List[Span]]:
    """Group the spans of this name by the symbol they carry."""
    grouped: Dict[Symbol, List[Span]] = {}
    for span in self.spans:
        # setdefault creates the bucket on first sight of a symbol.
        grouped.setdefault(span.symbol, []).append(span)
    return grouped

NamePart

Bases: object

A part of a name, such as a given name or family name. This object is used to compare and match names. It generates and caches representations of the name in various processing forms.

Source code in rigour/names/part.py
class NamePart(object):
    """One token of a name (for instance a given or family name). Name parts are the
    unit on which names are compared and matched; derived representations of the
    form are computed on demand."""

    __slots__ = ["form", "index", "tag", "is_modern_alphabet", "_ascii"]

    def __init__(
        self,
        form: str,
        index: Optional[int] = None,
        tag: NamePartTag = NamePartTag.ANY,
    ) -> None:
        self.form = form
        self.index = index
        self.tag = tag
        self.is_modern_alphabet = is_modern_alphabet(form)
        # Filled in by the `ascii` property the first time it is read.
        self._ascii: Optional[str] = None

    @property
    def ascii(self) -> Optional[str]:
        """Alphanumeric ASCII rendering of the form, or None if it is empty."""
        cached = self._ascii
        if cached is None:
            transliterated = ascii_text(self.form) or ""
            cached = "".join(filter(str.isalnum, transliterated))
            self._ascii = cached
        return cached or None

    @property
    def comparable(self) -> str:
        """The ASCII form for modern-alphabet parts that have one, else the form."""
        if self.is_modern_alphabet:
            text = self.ascii
            if text is not None:
                return text
        return self.form

    @property
    def metaphone(self) -> Optional[str]:
        """Metaphone code of the ASCII form, when one can be computed."""
        if not self.is_modern_alphabet:
            return None
        text = self.ascii
        # doesn't handle non-ascii characters
        return None if text is None else metaphone(text)

    def can_match(self, other: "NamePart") -> bool:
        """Decide from the tags alone whether this part may be matched to `other`."""
        mine, theirs = self.tag, other.tag
        if mine == NamePartTag.ANY or theirs == NamePartTag.ANY:
            return True
        # A given-name (family-name) part only pairs with a part from the same group.
        for group in (GIVEN_NAME_TAGS, FAMILY_NAME_TAGS):
            if mine in group and theirs not in group:
                return False
        return True

    def __eq__(self, other: Any) -> bool:
        try:
            same_form = other.form == self.form
            return same_form and other.index == self.index  # type: ignore
        except AttributeError:
            return False

    def __hash__(self) -> int:
        key = (self.index, self.form)
        return hash(key)

    def __len__(self) -> int:
        return len(self.form)

    def __repr__(self) -> str:
        return "<NamePart(%r, %s, %r)>" % (self.form, self.index, self.tag.value)

    @classmethod
    def tag_sort(cls, parts: list["NamePart"]) -> list["NamePart"]:
        """Return the parts ordered by the position of their tag in NAME_TAGS_ORDER."""

        def _tag_rank(part: "NamePart") -> int:
            return NAME_TAGS_ORDER.index(part.tag)

        return sorted(parts, key=_tag_rank)

can_match(other)

Check if this part can match another part. This is based on the tags of the parts.

Source code in rigour/names/part.py
def can_match(self, other: "NamePart") -> bool:
    """Decide from the tags alone whether this part may be matched to `other`."""
    mine, theirs = self.tag, other.tag
    # An untagged part on either side is compatible with everything.
    if mine == NamePartTag.ANY or theirs == NamePartTag.ANY:
        return True
    # A given-name (family-name) part only pairs with a part from the same group.
    for group in (GIVEN_NAME_TAGS, FAMILY_NAME_TAGS):
        if mine in group and theirs not in group:
            return False
    return True

tag_sort(parts) classmethod

Sort name parts by the position of their tag in NAME_TAGS_ORDER.

Source code in rigour/names/part.py
@classmethod
def tag_sort(cls, parts: list["NamePart"]) -> list["NamePart"]:
    """Return the parts ordered by the position of their tag in NAME_TAGS_ORDER."""

    def _tag_rank(part: "NamePart") -> int:
        return NAME_TAGS_ORDER.index(part.tag)

    return sorted(parts, key=_tag_rank)

NamePartTag

Bases: Enum

Within a name, identify name part types.

Source code in rigour/names/tag.py
class NamePartTag(Enum):
    """Labels for the role an individual part plays within a name."""

    # Member order is kept as-is since it is the enum's iteration order.
    ANY = "ANY"

    # Judging by their names, the following describe parts of person names.
    TITLE = "TITLE"
    GIVEN = "GIVEN"
    MIDDLE = "MIDDLE"
    FAMILY = "FAMILY"
    TRIBAL = "TRIBAL"
    PATRONYMIC = "PATRONYMIC"
    MATRONYMIC = "MATRONYMIC"
    HONORIFIC = "HONORIFIC"
    SUFFIX = "SUFFIX"
    NICK = "NICK"

    STOP = "STOP"  # Stopword
    NUM = "NUM"
    LEGAL = "LEGAL"  # Legal form of an organisation

NameTypeTag

Bases: Enum

Metadata on what sort of object is described by a name

Source code in rigour/names/tag.py
class NameTypeTag(Enum):
    """Labels for the kind of thing that a whole name refers to."""

    UNK = "UNK"  # Unknown
    ENT = "ENT"  # Entity
    PER = "PER"  # Person
    ORG = "ORG"  # Organization/Company
    OBJ = "OBJ"  # Object - Vessel, Security, etc.

Span

A span is a set of parts of a name that have been tagged with a symbol.

Source code in rigour/names/part.py
class Span:
    """A span is a set of parts of a name that have been tagged with a symbol."""

    __slots__ = ["parts", "symbol"]

    def __init__(self, parts: List[NamePart], symbol: Symbol) -> None:
        """Create a span over `parts` (stored as a tuple) carrying `symbol`."""
        self.parts = tuple(parts)
        self.symbol = symbol

    @property
    def comparable(self) -> str:
        """Return the comparison-suited string representation of the span."""
        return " ".join([part.comparable for part in self.parts])

    def __hash__(self) -> int:
        return hash((self.parts, self.symbol))

    def __eq__(self, other: Any) -> bool:
        # Compare the actual contents rather than the hashes: two different
        # spans (or a span and an unrelated object) may share a hash value.
        if not isinstance(other, Span):
            return False
        return self.parts == other.parts and self.symbol == other.symbol

    def __repr__(self) -> str:
        return f"<Span({self.parts!r}, {self.symbol})>"

comparable property

Return the comparison-suited string representation of the span.

Symbol

A symbol is a semantic interpretation applied to one or more parts of a name. Symbols can represent various categories such as organization classes, initials, names, ordinals, or phonetic transcriptions. Each symbol has a category and an identifier.

Source code in rigour/names/symbol.py
class Symbol:
    """A semantic interpretation attached to one or more parts of a name.

    Every symbol pairs a category (organization class, initial, name, ordinal,
    phonetic transcription, ...) with an identifier. Two symbols are equal when
    both of these are equal, and they hash accordingly."""

    class Category(Enum):
        ORG_CLASS = "ORGCLASS"
        SYMBOL = "SYMBOL"
        INITIAL = "INITIAL"
        NAME = "NAME"
        ORDINAL = "ORD"
        PHONETIC = "PHON"

    __slots__ = ["category", "id"]

    def __init__(self, category: Category, id: Any) -> None:
        """Build a symbol from its category and identifier."""
        self.category, self.id = category, id

    def __hash__(self) -> int:
        key = (self.category, self.id)
        return hash(key)

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, Symbol):
            same_category = self.category == other.category
            return same_category and self.id == other.id
        return False

    def __str__(self) -> str:
        return f"[{self.category.value}:{self.id}]"

    def __repr__(self) -> str:
        return f"<Symbol({self.category}, {self.id})>"

__init__(category, id)

Create a symbol with a category and an id.

Source code in rigour/names/symbol.py
def __init__(self, category: Category, id: Any) -> None:
    """Build a symbol from its category and identifier."""
    self.category, self.id = category, id

align_name_slop(query, result, max_slop=2)

Align name parts of companies and organizations. The idea here is to allow skipping tokens within the entity name if this improves overall match quality, but never to re-order name parts. The resulting alignment will contain the sorted name parts of both the query and the result, as well as any extra parts that were not aligned.

Note that one name part in one list may correspond to multiple name parts in the other list, so the alignment is not necessarily one-to-one.

The levenshtein distance is used to determine the best alignment, allowing for a certain spelling variation between the names.

Parameters:

Name Type Description Default
query List[NamePart]

The name parts of the query.

required
result List[NamePart]

The name parts of the result.

required
max_slop int

The maximum number of tokens that can be skipped in the alignment. Defaults to 2.

2

Returns: Alignment: An object containing the aligned name parts and any extra parts.

Source code in rigour/names/alignment.py
def align_name_slop(
    query: List[NamePart],
    result: List[NamePart],
    max_slop: int = 2,
) -> Alignment:
    """Align name parts of companies and organizations. The idea here is to allow
    skipping tokens within the entity name if this improves overall match quality,
    but never to re-order name parts. The resulting alignment will contain the
    sorted name parts of both the query and the result, as well as any extra parts
    that were not aligned.

    Note that one name part in one list may correspond to multiple name parts in the
    other list, so the alignment is not necessarily one-to-one.

    The `levenshtein` distance is used to determine the best alignment, allowing
    for a certain spelling variation between the names.

    Args:
        query (List[NamePart]): The name parts of the query.
        result (List[NamePart]): The name parts of the result.
        max_slop (int): The maximum number of tokens that can be skipped in the
            alignment. Defaults to 2.
    Returns:
        Alignment: An object containing the aligned name parts and any extra parts.
    """
    alignment = Alignment()
    # With at most one part on each side there is nothing to align; the input
    # lists themselves (not copies) become the sorted parts.
    if len(query) < 2 and len(result) < 2:
        alignment.query_sorted = query
        alignment.result_sorted = result
        return alignment

    # Cursors into both lists; everything before them has been consumed.
    query_index = 0
    result_index = 0
    while query_index < len(query) and result_index < len(result):
        # NOTE(review): best_alignment is defined elsewhere; as used here it
        # returns None or an object exposing .score, .left and .right.
        # get the best alignment of query to result
        query_best = best_alignment(
            query[query_index], result[result_index : result_index + max_slop + 1]
        )
        # get the best alignment of result to query
        # (swap=True presumably keeps .left on the query side - TODO confirm)
        result_best = best_alignment(
            result[result_index],
            query[query_index : query_index + max_slop + 1],
            swap=True,
        )
        # take the best of both
        if query_best is None and result_best is None:
            # No alignment found within slop, move forward with bad alignment
            # unless we are at the end of either list.
            if query_index == len(query) - 1 or result_index == len(result) - 1:
                break
            alignment.query_sorted.append(query[query_index])
            alignment.result_sorted.append(result[result_index])
            query_index += 1
            result_index += 1
            continue
        elif query_best is not None and result_best is not None:
            # Ties go to the query-side candidate.
            if query_best.score >= result_best.score:
                best = query_best
            else:
                best = result_best
        elif query_best is not None:
            best = query_best
        elif result_best is not None:
            best = result_best
        else:
            # All four None/not-None combinations are handled above.
            raise ValueError("Shouldn't reach here.")
        # add the best alignment to the Alignment
        alignment.query_sorted.append(best.left)
        alignment.result_sorted.append(best.right)
        # if we skip any, add them to extra
        # NOTE(review): the slices below assume NamePart.index equals the
        # part's position in `query` / `result` - confirm against callers.
        assert best.left.index is not None, best.left
        assert best.right.index is not None, best.right
        alignment.query_extra.extend(query[query_index : best.left.index])
        alignment.result_extra.extend(result[result_index : best.right.index])
        # move to the step after the aligned parts
        query_index = best.left.index + 1
        result_index = best.right.index + 1
    # Add slop remaining parts to extra and the rest to sorted.
    # We do this because max_slop parts are allowed to be ignored, but anything
    # beyond that should penalise any similarity comparison on the sorted parts.
    alignment.query_extra.extend(query[query_index : query_index + max_slop])
    alignment.query_sorted.extend(query[query_index + max_slop :])
    alignment.result_extra.extend(result[result_index : result_index + max_slop])
    alignment.result_sorted.extend(result[result_index + max_slop :])
    return alignment

align_name_strict(query, result, max_slop=2)

Align name parts of companies and organizations strictly by their token sequence. This implementation does not use fuzzy matching or Levenshtein distance, but rather aligns names only if individual name parts match exactly.

Parameters:

Name Type Description Default
query List[NamePart]

The name parts of the query.

required
result List[NamePart]

The name parts of the result.

required

Returns: Alignment: An object containing the aligned name parts and any extra parts.

Source code in rigour/names/alignment.py
def align_name_strict(
    query: List[NamePart], result: List[NamePart], max_slop: int = 2
) -> Alignment:
    """Align name parts of companies and organizations strictly by their token sequence. This
    implementation does not use fuzzy matching or Levenshtein distance, but rather aligns
    names only if individual name parts match exactly.

    Parts that cannot be matched exactly within the remaining slop are kept in the
    sorted lists as a (mismatching) pair, so that they still count against any
    comparison of the sorted parts instead of silently disappearing.

    Args:
        query (List[NamePart]): The name parts of the query.
        result (List[NamePart]): The name parts of the result.
        max_slop (int): The maximum number of parts, counted across both names,
            that may be skipped (moved to the extra lists). Defaults to 2.
    Returns:
        Alignment: An object containing the aligned name parts and any extra parts.
    """
    alignment = Alignment()
    # A single part on either side leaves nothing to align.
    if len(query) < 2 or len(result) < 2:
        alignment.query_sorted = query
        alignment.result_sorted = result
        return alignment
    query = NamePart.tag_sort(query)
    result = NamePart.tag_sort(result)
    query_offset = 0
    result_offset = 0
    while query_offset < len(query) and result_offset < len(result):
        # Every part moved to an extra list uses up one unit of slop.
        slop_used = len(alignment.query_extra) + len(alignment.result_extra)
        slop_remaining = max(0, max_slop - slop_used)
        for i in range(slop_remaining + 1):
            # Try skipping `i` query parts to reach the current result part ...
            query_next = query_offset + i
            if query_next < len(query):
                query_part = query[query_next]
                if query_part.comparable == result[result_offset].comparable:
                    alignment.query_sorted.append(query_part)
                    alignment.result_sorted.append(result[result_offset])
                    alignment.query_extra.extend(query[query_offset:query_next])
                    query_offset += i
                    break
            # ... or skipping `i` result parts to reach the current query part.
            result_next = result_offset + i
            if result_next < len(result):
                result_part = result[result_next]
                if result_part.comparable == query[query_offset].comparable:
                    alignment.query_sorted.append(query[query_offset])
                    alignment.result_sorted.append(result_part)
                    alignment.result_extra.extend(result[result_offset:result_next])
                    result_offset += i
                    break
        else:
            # No exact match within the slop: keep both current parts as a
            # mismatching pair rather than dropping them from the alignment.
            alignment.query_sorted.append(query[query_offset])
            alignment.result_sorted.append(result[result_offset])

        query_offset += 1
        result_offset += 1

    # Whatever is left on either side goes to the sorted parts.
    alignment.query_sorted.extend(query[query_offset:])
    alignment.result_sorted.extend(result[result_offset:])
    return alignment

align_person_name_order(query, result)

Aligns the name parts of a person name for the query and result based on their tags and their string similarity such that the most similar name parts are matched.

Parameters:

Name Type Description Default
query List[NamePart]

The name parts from the query.

required
result List[NamePart]

The name parts from the result.

required

Returns:

Name Type Description
Alignment Alignment

An object containing the aligned name parts and any extra parts.

Source code in rigour/names/alignment.py
def align_person_name_order(query: List[NamePart], result: List[NamePart]) -> Alignment:
    """Aligns the name parts of a person name for the query and result based on their
    tags and their string similarity such that the most similar name parts are matched.

    Pairs are picked greedily: in every round the best-scoring compatible pair
    (or group of parts) is moved to the sorted lists, until nothing scores above
    zero. If no pair could be formed at all, the tag-sorted alignment is returned
    instead.

    Args:
        query (List[NamePart]): The name parts from the query.
        result (List[NamePart]): The name parts from the result.

    Returns:
        Alignment: An object containing the aligned name parts and any extra parts.
    """
    alignment = Alignment()
    if not len(query):
        # Nothing to align against: the result parts are passed through as-is.
        alignment.result_sorted = result
        return alignment

    # Work lists of not-yet-aligned parts, longest form first
    # (len() of a NamePart is the length of its form).
    query_left = sorted(query, key=len, reverse=True)
    result_left = sorted(result, key=len, reverse=True)
    while len(query_left) > 0 and len(result_left) > 0:
        best_score = 0.0
        best_query_parts: Optional[List[NamePart]] = None
        best_result_parts: Optional[List[NamePart]] = None
        # Consider every pairing of the remaining query and result parts.
        for qp, rp in product(query_left, result_left):
            if not qp.can_match(rp):
                continue
            if qp.comparable == rp.comparable:
                # An exact match cannot be beaten; stop searching this round.
                best_score = 1.0
                best_query_parts = [qp]
                best_result_parts = [rp]
                break
            # check the Levenshtein distance between the two parts
            score = _name_levenshtein([qp], [rp])
            if score > best_score:
                best_query_parts = [qp]
                best_result_parts = [rp]
                # NOTE(review): _pack_short_parts is defined elsewhere; it
                # presumably gathers several short parts from the other side
                # to set against the longer part - confirm.
                if len(qp.form) > len(rp.form):
                    best_result_parts = _pack_short_parts(qp, rp, result_left)
                elif len(rp.form) > len(qp.form):
                    best_query_parts = _pack_short_parts(rp, qp, query_left)
                # The score to beat is that of the (possibly packed) groups.
                best_score = _name_levenshtein(best_query_parts, best_result_parts)

        if best_score == 0.0:
            # no match found, break out of the loop
            break

        # Move the chosen parts from the work lists to the aligned output.
        if best_query_parts is not None:
            alignment.query_sorted.extend(best_query_parts)
            for qp in best_query_parts:
                query_left.remove(qp)
        if best_result_parts is not None:
            alignment.result_sorted.extend(best_result_parts)
            for rp in best_result_parts:
                result_left.remove(rp)

    if not len(alignment.query_sorted):
        # Nothing could be paired up: fall back to ordering by tag.
        return align_tag_sort(query, result)

    # Parts that found no partner are reported as extra.
    alignment.query_extra.extend(query_left)
    alignment.result_extra.extend(result_left)
    return alignment

align_tag_sort(query, result)

Align name parts of companies and organizations by sorting them by their tags. This is a simple alignment that does not allow for any slop or re-ordering of name parts, but it is useful for cases where the names are already well-formed and comparable.

Parameters:

Name Type Description Default
query List[NamePart]

The name parts of the query.

required
result List[NamePart]

The name parts of the result.

required

Returns: Alignment: An object containing the aligned name parts and any extra parts.

Source code in rigour/names/alignment.py
def align_tag_sort(query: List[NamePart], result: List[NamePart]) -> Alignment:
    """Build an alignment by ordering the parts of each name by their tags.

    No parts are skipped or paired up with each other; both names are simply
    passed through `NamePart.tag_sort`. This is meant for names that are already
    well-formed and directly comparable.

    Args:
        query (List[NamePart]): The name parts of the query.
        result (List[NamePart]): The name parts of the result.
    Returns:
        Alignment: An object holding the tag-sorted parts of both names and no
            extra parts.
    """
    query_by_tag = NamePart.tag_sort(query)
    result_by_tag = NamePart.tag_sort(result)
    aligned = Alignment()
    aligned.query_sorted = query_by_tag
    aligned.result_sorted = result_by_tag
    return aligned

extract_org_types(name, normalizer=_normalize_compare, generic=False)

Match any organization type designation (e.g. LLC, Inc, GmbH) in the given entity name and return the extracted type.

This can be used as a very poor man's method to determine if a given string is a company name.

Parameters:

Name Type Description Default
name str

The text to be processed. It is assumed to be already normalized (see below).

required
normalizer Callable[[str | None], str | None]

A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.

_normalize_compare
generic bool

If True, return the generic form of the organization type (e.g. LLC, JSC) instead of the type-specific comparison form (GmbH, AB, NV).

False

Returns:

Type Description
List[Tuple[str, str]]

Tuple[str, str]: Tuple of the org type as matched, and the compare form of it.

Source code in rigour/names/org_types.py
def extract_org_types(
    name: str, normalizer: Normalizer = _normalize_compare, generic: bool = False
) -> List[Tuple[str, str]]:
    """Find organization type designations (e.g. LLC, Inc, GmbH) in an entity name.

    This can double as a very rough check for whether a string is a company name.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized.
        normalizer (Callable[[str | None], str | None]): A text normalization function
            applied to the lookup values before matching, to remove text anomalies and
            make matches more likely.
        generic (bool): If True, map to the generic form of the organization type
            (e.g. LLC, JSC) instead of the type-specific comparison form (GmbH, AB, NV).

    Returns:
        List[Tuple[str, str]]: One tuple per match, holding the org type as matched
            and its mapped form (the matched text itself when no mapping exists).
    """
    if generic:
        build_replacer = _generic_replacer
    else:
        build_replacer = _compare_replacer
    replacer = build_replacer(normalizer=normalizer)
    return [
        (found, replacer.mapping.get(found, found))
        for found in replacer.extract(name)
    ]

is_name(name)

Check if the given string is a name. The string is considered a name if it contains at least one character that is a letter (category 'L' in Unicode).

Source code in rigour/names/check.py
def is_name(name: str) -> bool:
    """Tell whether a string can be a name, i.e. whether at least one of its
    characters is a letter (any Unicode category starting with 'L')."""
    return any(unicodedata.category(ch).startswith("L") for ch in name)

load_person_names()

Load the person QID to name mappings from disk. This is a collection of aliases (in various alphabets) of person name parts mapped to a Wikidata QID representing that name part.

Returns:

Type Description
Generator[Tuple[str, List[str]], None, None]

Generator[Tuple[str, List[str]], None, None]: A generator yielding tuples of QID and list of names.

Source code in rigour/names/person.py
def load_person_names() -> Generator[Tuple[str, List[str]], None, None]:
    """Stream the person name data file as (QID, aliases) pairs.

    The file at ``NAMES_DATA_PATH`` is read as UTF-8, one record per line. Each
    line is stripped of surrounding whitespace and must have the shape
    ``alias, alias, ... => QID``; a line that does not split into exactly two
    pieces around `` => `` raises ``ValueError`` from the tuple unpacking.

    Returns:
        Generator[Tuple[str, List[str]], None, None]: A generator yielding tuples of QID and list of names.
    """
    with open(NAMES_DATA_PATH, "r", encoding="utf-8") as fh:
        for raw in fh:
            # left of the arrow: comma-separated aliases, right of it: the QID
            aliases, qid = raw.strip().split(" => ")
            yield qid, aliases.split(", ")

load_person_names_mapping(normalizer=noop_normalizer)

Load the person QID to name mappings from disk. This is a collection of aliases (in various alphabets) of person name parts mapped to a Wikidata QID representing that name part.

Parameters:

Name Type Description Default
normalizer Normalizer

A function to normalize names. Defaults to noop_normalizer.

noop_normalizer

Returns:

Type Description
Dict[str, Set[str]]

Dict[str, Set[str]]: A dictionary mapping normalized names to sets of QIDs.

Source code in rigour/names/person.py
def load_person_names_mapping(
    normalizer: Normalizer = noop_normalizer,
) -> Dict[str, Set[str]]:
    """Build an index from normalized person name aliases to Wikidata QIDs.

    Every alias yielded by ``load_person_names()`` is passed through
    ``normalizer``. Aliases for which the normalizer returns ``None`` are
    dropped; all others are used as keys, and the QID of the record they came
    from is added to the set stored under that key.

    Args:
        normalizer (Normalizer, optional): A function to normalize names. Defaults to noop_normalizer.

    Returns:
        Dict[str, Set[str]]: A dictionary mapping normalized names to sets of QIDs.
    """
    mapping: Dict[str, Set[str]] = {}
    for qid, aliases in load_person_names():
        for alias in aliases:
            key = normalizer(alias)
            if key is not None:
                mapping.setdefault(key, set()).add(qid)
    return mapping

pick_case(names)

Pick the best mix of lower- and uppercase characters from a set of names that are identical except for case.

Parameters:

Name Type Description Default
names List[str]

A list of identical names in different cases.

required

Returns:

Type Description
str

str: The best name for display.

Source code in rigour/names/pick.py
def pick_case(names: List[str]) -> str:
    """Pick the best mix of lower- and uppercase characters from a set of names
    that are identical except for case.

    The title-cased form of the first name is used as the reference. Each distinct
    spelling is scored by the number of positions at which it differs from that
    reference, and the spelling with the fewest differences wins. On a tie the
    spelling that appears first in ``names`` is returned.

    Args:
        names (List[str]): A list of identical names in different cases.

    Returns:
        str: The best name for display.

    Raises:
        ValueError: If ``names`` is empty, or if (with two or more names) a name
            differs from the reference in length or in anything other than case.
    """
    if len(names) == 0:
        raise ValueError("Cannot pick a name from an empty list.")
    if len(names) == 1:
        return names[0]
    reference = names[0].title()
    # dict.fromkeys() de-duplicates while keeping first-seen order, so a spelling
    # that occurs several times in the input is scored only once and is not
    # penalised for being frequent.
    difference: Dict[str, int] = dict.fromkeys(names, 0)
    for name in difference:
        # Reject both shorter AND longer names; a longer name would otherwise
        # have its tail silently ignored by the comparison below.
        if len(name) != len(reference):
            raise ValueError("Name length mismatch: %r vs %r" % (name, reference))
        for nchar, char in zip(name, reference):
            if nchar == char:
                continue
            if nchar.lower() != char.lower():
                raise ValueError("Names mismatch: %r vs %r" % (name, reference))
            difference[name] += 1
    # min() returns the first minimal entry, i.e. the earliest spelling on a tie
    return min(difference.items(), key=lambda x: x[1])[0]

pick_name(names)

Pick the best name from a list of names. This is meant to pick a centroid name, with a bias towards names in a Latin script.

Parameters:

Name Type Description Default
names List[str]

A list of names.

required

Returns:

Type Description
Optional[str]

Optional[str]: The best name for display.

Source code in rigour/names/pick.py
def pick_name(names: List[str]) -> Optional[str]:
    """Choose a single representative ("centroid") name from a list of names,
    favouring names written in a Latin script.

    Names are visited in sorted order. Each non-blank name contributes a weight
    of ``1 + latin_share(name)`` to its stripped, lower-cased form and, when
    ``ascii_text`` yields a non-empty result for that form, to that result too.
    If exactly one name has a Latin share above 0.9 it is returned directly;
    otherwise the candidates produced by ``levenshtein_pick`` are walked and the
    first surface form that is literally present in ``names`` is returned.

    Args:
        names (List[str]): A list of names.

    Returns:
        Optional[str]: The best name for display, or None if no candidate from
            the input could be selected (e.g. the list is empty or all blank).
    """
    scores: Dict[str, float] = defaultdict(float)
    surfaces: Dict[str, List[str]] = defaultdict(list)
    mostly_latin: List[str] = []
    for candidate in sorted(names):
        lowered = candidate.strip().lower()
        if not lowered:
            continue
        share = latin_share(candidate)
        if share > 0.9:
            mostly_latin.append(candidate)
        # the constant 1 gives even an entirely non-Latin name a base weight
        score = 1 + share
        scores[lowered] += score
        # the title-cased variant is offered as a candidate too; it can only be
        # returned if it also occurs verbatim in the input (see the final loop)
        surfaces[lowered].extend((candidate, candidate.title()))

        transliterated = ascii_text(lowered)
        if transliterated is not None and len(transliterated) > 0:
            scores[transliterated] += score
            surfaces[transliterated].append(candidate)

    if len(mostly_latin) == 1:
        return mostly_latin[0]

    for form in levenshtein_pick(list(scores), scores):
        for surface in levenshtein_pick(surfaces.get(form, []), {}):
            if surface in names:
                return surface
    return None

prenormalize_name(name)

Prepare a name for tokenization and matching.

Source code in rigour/names/tokenize.py
def prenormalize_name(name: Optional[str]) -> str:
    """Prepare a name for tokenization and matching.

    The name is brought into Unicode NFC form and then lower-cased.

    Args:
        name (Optional[str]): The raw name, or None.

    Returns:
        str: The NFC-normalized, lower-cased name; an empty string for None.
    """
    return "" if name is None else unicodedata.normalize("NFC", name).lower()

reduce_names(names)

Select a reduced set of names from a list of names. This is used to prepare the set of names linked to a person, organization, or other entity for publication.

Parameters:

Name Type Description Default
names List[str]

A list of names.

required

Returns:

Type Description
List[str]

List[str]: The reduced list of names.

Source code in rigour/names/pick.py
def reduce_names(names: List[str]) -> List[str]:
    """Collapse a list of names into a smaller set suitable for publication,
    e.g. as the names attached to a person, organization or other entity.

    Entries rejected by ``is_name`` are dropped. The remaining names are grouped
    by their lower-cased form and ``pick_case`` selects one spelling per group.
    If ``pick_case`` fails for a group, every spelling in that group is kept.

    Args:
        names (List[str]): A list of names.

    Returns:
        List[str]: The reduced list of names.
    """
    if len(names) < 2:
        # nothing to merge; only validity filtering applies (without logging)
        return [n for n in names if is_name(n)]
    by_lower: Dict[str, List[str]] = defaultdict(list)
    for name in names:
        if is_name(name):
            by_lower[name.lower()].append(name)
        else:
            # not a usable name (e.g. empty, or without any letters)
            log.warning("Invalid name found: %r", name)
    result: List[str] = []
    for variants in by_lower.values():
        try:
            result.append(pick_case(variants))
        except (ValueError, IndexError, KeyError) as exc:
            log.warning("Failed to pick case: %s", exc)
            # no single spelling could be chosen, so retain the whole group
            result.extend(variants)
    return result

remove_org_prefixes(name)

Remove organisation name prefixes (the patterns listed in ORG_NAME_PREFIXES) from the given name.

Source code in rigour/names/prefix.py
def remove_org_prefixes(name: str) -> str:
    """Strip organisation name prefixes from a name.

    Every match of the pattern built by ``re_prefixes(ORG_NAME_PREFIXES)`` is
    replaced with an empty string.

    Args:
        name (str): The organisation name to clean.

    Returns:
        str: The name with all matched prefixes removed.
    """
    pattern = re_prefixes(ORG_NAME_PREFIXES)
    return pattern.sub("", name)

remove_org_types(name, replacement='', normalizer=_normalize_compare)

Match any organization type designation (e.g. LLC, Inc, GmbH) in the given entity name and replace it with the given fixed string (empty by default, which signals removal).

Parameters:

Name Type Description Default
name str

The text to be processed. It is assumed to be already normalized (see below).

required
normalizer Callable[[str | None], str | None]

A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.

_normalize_compare

Returns:

Name Type Description
str str

The text with organization types replaced/removed.

Source code in rigour/names/org_types.py
def remove_org_types(
    name: str, replacement: str = "", normalizer: Normalizer = _normalize_compare
) -> str:
    """Find organization type designations (e.g. LLC, Inc, GmbH) in an entity name and
    substitute each of them with a fixed string.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized.
        replacement (str): The text put in place of each organization type. The default,
            an empty string, signals removal.
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.

    Returns:
        str: The text with organization types replaced/removed.
    """
    return _compare_replacer(normalizer=normalizer).remove(
        name, replacement=replacement
    )

remove_person_prefixes(name)

Remove prefixes like Mr., Mrs., etc.

Source code in rigour/names/prefix.py
def remove_person_prefixes(name: str) -> str:
    """Strip person name prefixes (such as Mr., Mrs., etc.) from a name.

    Every match of the pattern built by ``re_prefixes(PERSON_NAME_PREFIXES)`` is
    replaced with an empty string.

    Args:
        name (str): The person name to clean.

    Returns:
        str: The name with all matched prefixes removed.
    """
    pattern = re_prefixes(PERSON_NAME_PREFIXES)
    return pattern.sub("", name)

replace_org_types_compare(name, normalizer=_normalize_compare, generic=False)

Replace any organization type indicated in the given entity name (often as a prefix or suffix) with a heavily normalized form label. This will re-write country-specific entity types (e.g. GmbH) into a simplified spelling suitable for comparison using string distance. The resulting text is meant to be used in comparison processes, but is no longer fit for presentation to a user.

Parameters:

Name Type Description Default
name str

The text to be processed. It is assumed to be already normalized (see below).

required
normalizer Callable[[str | None], str | None]

A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.

_normalize_compare
generic bool

If True, return the generic form of the organization type (e.g. LLC, JSC) instead of the type-specific comparison form (GmbH, AB, NV).

False

Returns:

Type Description
str

str: The text with organization types replaced.

Source code in rigour/names/org_types.py
def replace_org_types_compare(
    name: str, normalizer: Normalizer = _normalize_compare, generic: bool = False
) -> str:
    """Rewrite organization types found in an entity name (often a prefix or suffix) into a
    heavily normalized label. Country-specific entity types (e.g. GmbH) end up in a simplified
    spelling that suits string-distance comparison. The output is intended for comparison
    only and is no longer fit for presentation to a user.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized.
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.
        generic (bool): If True, return the generic form of the organization type (e.g. LLC, JSC) instead
            of the type-specific comparison form (GmbH, AB, NV).

    Returns:
        str: The text with organization types replaced. If the replacement result is
            empty or None, the input name is returned unchanged.
    """
    if generic:
        factory = _generic_replacer
    else:
        factory = _compare_replacer
    replaced = factory(normalizer=normalizer)(name)
    # a falsy result (None or "") falls back to the unmodified input
    return replaced if replaced else name

replace_org_types_display(name, normalizer=normalize_display)

Replace organization types in the text with their shortened form. This will perform a display-safe (light) form of normalization, useful for shortening spelt-out legal forms into common abbreviations (e.g. Siemens Aktiengesellschaft -> Siemens AG).

If the result of the replacement yields an empty string, the original text is returned as-is.

Parameters:

Name Type Description Default
name str

The text to be processed. It is assumed to be already normalized (see below).

required
normalizer Callable[[str | None], str | None]

A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.

normalize_display

Returns:

Type Description
str

str: The text with organization types replaced.

Source code in rigour/names/org_types.py
def replace_org_types_display(
    name: str, normalizer: Normalizer = normalize_display
) -> str:
    """Replace organization types in the text with their shortened form. This will perform
    a display-safe (light) form of normalization, useful for shortening spelt-out legal forms
    into common abbreviations (e.g. Siemens Aktiengesellschaft -> Siemens AG).

    If the replacement yields no text (None or an empty string), the original text is
    returned as-is. If the input was entirely upper-case, the result is upper-cased again
    so that inserted abbreviations do not break the casing of the name.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized.
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.

    Returns:
        str: The text with organization types replaced.
    """
    is_uppercase = name.isupper()
    replacer = _display_replacer(normalizer=normalizer)
    out_text = replacer(name)
    # Treat an empty result like a missing one: the documented contract is to hand
    # back the original text rather than an empty display name.
    if not out_text:
        return name
    if is_uppercase:
        out_text = out_text.upper()
    return out_text

tag_org_name(name, normalizer)

Tag the name with the organization type and symbol tags.

Source code in rigour/names/tagging.py
def tag_org_name(name: Name, normalizer: Normalizer) -> Name:
    """Tag the name with the organization type and symbol tags.

    The organization tagger obtained for ``normalizer`` is run over
    ``name.norm_form``; every (phrase, symbol) pair it produces is applied to the
    name before part tags are inferred.

    Args:
        name (Name): The name to annotate; symbols are applied to this object.
        normalizer (Normalizer): Normalization function used to obtain the tagger.

    Returns:
        Name: The result of ``_infer_part_tags`` for the annotated name.
    """
    org_tagger = _get_org_tagger(normalizer)
    matches = org_tagger(name.norm_form)
    for phrase, symbol in matches:
        name.apply_phrase(phrase, symbol)
    return _infer_part_tags(name)

tag_person_name(name, normalizer, any_initials=False)

Tag a person's name with the person name part and other symbol tags.

Source code in rigour/names/tagging.py
def tag_person_name(
    name: Name, normalizer: Normalizer, any_initials: bool = False
) -> Name:
    """Tag a person's name with the person name part and other symbol tags.

    Args:
        name (Name): The name to annotate; symbols are applied to this object.
        normalizer (Normalizer): Normalization function used to obtain the person tagger.
        any_initials (bool): If True, any single-character part receives an INITIAL
            symbol, not only parts whose tag is in ``GIVEN_NAME_TAGS``.

    Returns:
        Name: The result of ``_infer_part_tags`` for the annotated name.
    """
    # Step 1: INITIAL symbols. These cover abbreviated first or middle names,
    # as in "J. Smith" or "John Q. Smith". Parts that are not in a modern
    # alphabet are left alone.
    for part in name.parts:
        if part.is_modern_alphabet:
            # the symbol carries the first character of the comparable form
            initial = Symbol(Symbol.Category.INITIAL, part.comparable[0])
            lone_letter = any_initials and len(part.form) == 1
            if lone_letter or part.tag in GIVEN_NAME_TAGS:
                name.apply_part(part, initial)

    # Step 2: symbols found by the person tagger in the normalized form.
    person_tagger = _get_person_tagger(normalizer)
    for phrase, symbol in person_tagger(name.norm_form):
        name.apply_phrase(phrase, symbol)

    return _infer_part_tags(name)

tokenize_name(text, token_min_length=1)

Split a person or entity's name into name parts.

Source code in rigour/names/tokenize.py
def tokenize_name(text: str, token_min_length: int = 1) -> List[str]:
    """Split a person or entity's name into name parts.

    Full stops and apostrophes (``.``, ``'``, ``’``) are dropped outright. Every
    other character is looked up by its Unicode category in
    ``TOKEN_SEP_CATEGORIES``: a ``None`` entry drops the character, an entry equal
    to ``WS`` ends the current token, any other entry is appended in place of the
    character, and characters whose category is not listed are kept unchanged.

    Args:
        text (str): The name to split.
        token_min_length (int): Tokens with fewer characters than this are discarded.

    Returns:
        List[str]: The tokens in order of appearance.
    """
    # FIXME: Do we want to support CJK scripts at some stage?
    # TODO: Do we want to do some form of unicode normalization (e.g. NFC) here?
    tokens: List[str] = []
    current: List[str] = []
    for char in text:
        if char in ".'’":
            continue
        mapped = TOKEN_SEP_CATEGORIES.get(unicodedata.category(char), char)
        if mapped is None:
            continue
        if mapped != WS:
            current.append(mapped)
            continue
        # token boundary: keep the pending token only if it is long enough,
        # then start over regardless
        if len(current) >= token_min_length:
            tokens.append("".join(current))
        current.clear()

    # flush whatever is pending after the last character
    if len(current) >= token_min_length:
        tokens.append("".join(current))
    return tokens