Skip to content

Names

rigour.names

Name handling utilities for person and organisation names. This module contains a large (and growing) set of tools for handling names. In general, there are three types of names: people, organizations, and objects. Different normalization may be required for each of these types, including prefix removal for person names (e.g. "Mr." or "Ms.") and type normalization for organization names (e.g. "Incorporated" -> "Inc" or "Limited" -> "Ltd").

The Name class is meant to provide a structure for a name, including its original form, normalized form, metadata on the type of thing described by the name, and the language of the name. The NamePart class is used to represent individual parts of a name, such as the first name, middle name, and last name.

Name

Bases: object

A name of a thing, such as a person, organization or object. Each name consists of a sequence of parts, each of which has a form and a tag. The form is the text of the part, and the tag is a label indicating the type of part. For example, in the name "John Smith", "John" is a given name and "Smith" is a family name. The tag for "John" would be NamePartTag.GIVEN and the tag for "Smith" would be NamePartTag.FAMILY. The form for both parts would be the text of the part itself.

Source code in rigour/names/name.py
class Name(object):
    """A name of a thing, such as a person, organization or object. Each name consists of a
    sequence of parts, each of which has a form and a tag. The form is the text of the part, and the tag
    is a label indicating the type of part. For example, in the name "John Smith", "John" is a given name
    and "Smith" is a family name. The tag for "John" would be `NamePartTag.GIVEN` and the tag for "Smith"
    would be `NamePartTag.FAMILY`. The form for both parts would be the text of the part itself.
    """

    __slots__ = ["original", "form", "tag", "lang", "parts", "spans"]

    def __init__(
        self,
        original: str,
        form: Optional[str] = None,
        tag: NameTypeTag = NameTypeTag.UNK,
        lang: Optional[str] = None,
        parts: Optional[List[NamePart]] = None,
    ):
        self.original = original
        # Fall back to a pre-normalized version of the original string when no
        # explicit form is supplied.
        self.form = form or prenormalize_name(original)
        self.tag = tag
        self.lang = lang
        self.parts: List[NamePart] = parts or []
        if parts is None:
            # No parts supplied: tokenize the normalized form into parts.
            # (`token` avoids shadowing the `form` argument.)
            for i, token in enumerate(tokenize_name(self.form)):
                self.parts.append(NamePart(token, i))
        self.spans: List[Span] = []

    @property
    def comparable(self) -> str:
        """Return the ASCII representation of the name, if available."""
        return " ".join(part.comparable for part in self.parts)

    @property
    def norm_form(self) -> str:
        """Return the normalized form of the name by joining name parts."""
        return " ".join(part.form for part in self.parts)

    def tag_text(self, text: str, tag: NamePartTag, max_matches: int = 1) -> None:
        """Tags name parts from a text with a known tag type.

        For example, if the name is "John Smith", and we know that "John" is the given name,
        this method will tag that name part with NamePartTag.GIVEN.

        The tagger can skip tokens in the name. For example, if the name is
        "Karl-Theodor Maria Nikolaus zu Guttenberg", and `text` is "Karl-Theodor
        Nikolaus", both "Karl-Theodor" and "Nikolaus" will be tagged, while
        "Maria" will not be tagged.

        If `text` is not matched in full, the tagger will not tag any name parts. For example,
        if the name is "John Smith", and `text` is "John Ted", "John" will not be tagged.

        The tagger will tag up to `max_matches` occurrences of `text` in the name.
        For example, if the name is "John John Smith", and `text` is "John", both
        "John"s will be tagged if `max_matches` is >= 2.
        """
        tokens = tokenize_name(prenormalize_name(text))
        if len(tokens) == 0:
            return

        matches = 0
        matching: List[NamePart] = []
        for part in self.parts:
            # The token we need next is the one following those already matched.
            # len(matching) < len(tokens) always holds here: a complete match
            # below either returns or resets `matching`.
            next_token = tokens[len(matching)]
            if part.form == next_token:
                matching.append(part)
            # Only tag if we have matched the entire text
            if len(matching) == len(tokens):
                for matched_part in matching:
                    if matched_part.tag == NamePartTag.UNSET:
                        matched_part.tag = tag
                    elif not matched_part.tag.can_match(tag):
                        # if the part is already tagged, we check compatibility and
                        # otherwise mark it as an outcast from polite society
                        matched_part.tag = NamePartTag.AMBIGUOUS
                matches += 1
                if matches >= max_matches:
                    return
                # Reset the list of matching parts, i.e. start over matching from the
                # beginning of the tokenized text if we haven't reached `max_matches`.
                matching = []

    def apply_phrase(self, phrase: str, symbol: Symbol) -> None:
        """Apply a symbol to a phrase in the name.

        The phrase is expected to be a space-separated sequence of normalized
        tokens; parts are matched in order (intervening parts may be skipped)
        and every complete occurrence is recorded as a span.
        """
        matching: List[NamePart] = []
        tokens = phrase.split(" ")
        for part in self.parts:
            # Next token to look for is the one after those already matched.
            next_token = tokens[len(matching)]
            if part.form == next_token:
                matching.append(part)
            if len(matching) == len(tokens):
                self.spans.append(Span(matching, symbol))
                # Reset to find further occurrences of the phrase.
                matching = []

    def apply_part(self, part: NamePart, symbol: Symbol) -> None:
        """Apply a symbol to a part of the name by recording a one-part span."""
        self.spans.append(Span([part], symbol))

    @property
    def symbols(self) -> Set[Symbol]:
        """Return the set of symbols applied to the name."""
        symbols: Set[Symbol] = set()
        for span in self.spans:
            symbols.add(span.symbol)
        return symbols

    def contains(self, other: "Name") -> bool:
        """Check if this name contains another name.

        A name never contains itself, and untyped (UNK) names contain nothing.
        """
        if self == other or self.tag == NameTypeTag.UNK:
            return False
        if len(self.parts) < len(other.parts):
            return False

        if self.tag == NameTypeTag.PER:
            forms = [part.comparable for part in self.parts]
            other_forms = [part.comparable for part in other.parts]
            common_forms = list_intersection(forms, other_forms)

            # we want to make this support middle initials so that
            # "John Smith" can match "J. Smith"
            for ospan in other.spans:
                if ospan.symbol.category == Symbol.Category.INITIAL:
                    # Only single-letter parts count as initials here.
                    if len(ospan.parts[0].form) > 1:
                        continue
                    for span in self.spans:
                        if span.symbol == ospan.symbol:
                            common_forms.append(ospan.comparable)

            # If every part of the other name is represented in the common forms,
            # we consider it a match.
            if len(common_forms) == len(other_forms):
                return True

        return other.norm_form in self.norm_form

    def __eq__(self, other: Any) -> bool:
        try:
            return self.form == other.form  # type: ignore
        except AttributeError:
            return False

    def __hash__(self) -> int:
        return hash(self.form)

    def __str__(self) -> str:
        return self.original

    def __repr__(self) -> str:
        return "<Name(%r, %r, %r)>" % (self.original, self.form, self.tag.value)

    @classmethod
    def consolidate_names(cls, names: Iterable["Name"]) -> Set["Name"]:
        """Remove short names that are contained in longer names.

        This is useful when building a matcher to prevent a scenario where a short
        version of a name ("John Smith") is matched to a query ("John K Smith"), where a longer
        version would have disqualified the match ("John K Smith" != "John R Smith").
        """
        # Materialize the iterable: it is traversed once to build the set and
        # twice more by itertools.product, so a generator argument would
        # otherwise be exhausted after the first pass.
        names = list(names)
        # We call these super_names because they are (non-strict) supersets of names.
        super_names = set(names)

        for name, other in itertools.product(names, names):
            # Check if name is still in super_names, otherwise two equal names
            # will remove each other with none being left.
            if name in super_names and name.contains(other):
                # Use discard instead of remove here because other may already have been kicked out
                # by another name of which it was a subset.
                super_names.discard(other)

        return super_names

comparable property

Return the ASCII representation of the name, if available.

norm_form property

Return the normalized form of the name by joining name parts.

symbols property

Return the set of symbols applied to the name.

apply_part(part, symbol)

Apply a symbol to a part of the name.

Source code in rigour/names/name.py
def apply_part(self, part: NamePart, symbol: Symbol) -> None:
    """Apply a symbol to a single part of the name by recording a one-part span."""
    self.spans.append(Span([part], symbol))

apply_phrase(phrase, symbol)

Apply a symbol to a phrase in the name.

Source code in rigour/names/name.py
def apply_phrase(self, phrase: str, symbol: Symbol) -> None:
    """Apply a symbol to a phrase in the name.

    The phrase is expected to be a space-separated sequence of normalized
    tokens; parts are matched in order (intervening parts may be skipped)
    and every complete occurrence is recorded as a span.
    """
    matching: List[NamePart] = []
    tokens = phrase.split(" ")
    for part in self.parts:
        # Next token to look for is the one after those already matched;
        # len(matching) < len(tokens) always holds because a full match
        # resets `matching` below.
        next_token = tokens[len(matching)]
        if part.form == next_token:
            matching.append(part)
        if len(matching) == len(tokens):
            self.spans.append(Span(matching, symbol))
            # Reset to find further occurrences of the phrase.
            matching = []

consolidate_names(names) classmethod

Remove short names that are contained in longer names.

This is useful when building a matcher to prevent a scenario where a short version of a name ("John Smith") is matched to a query ("John K Smith"), where a longer version would have disqualified the match ("John K Smith" != "John R Smith").

Source code in rigour/names/name.py
@classmethod
def consolidate_names(cls, names: Iterable["Name"]) -> Set["Name"]:
    """Remove short names that are contained in longer names.

    This is useful when building a matcher to prevent a scenario where a short
    version of a name ("John Smith") is matched to a query ("John K Smith"), where a longer
    version would have disqualified the match ("John K Smith" != "John R Smith").
    """
    # Materialize the iterable: it is traversed once to build the set and
    # twice more by itertools.product, so a generator argument would
    # otherwise be exhausted after the first pass and nothing consolidated.
    names = list(names)
    # We call these super_names because they are (non-strict) supersets of names.
    super_names = set(names)

    for name, other in itertools.product(names, names):
        # Check if name is still in super_names, otherwise two equal names
        # will remove each other with none being left.
        if name in super_names and name.contains(other):
            # Use discard instead of remove here because other may already have been kicked out
            # by another name of which it was a subset.
            super_names.discard(other)

    return super_names

contains(other)

Check if this name contains another name.

Source code in rigour/names/name.py
def contains(self, other: "Name") -> bool:
    """Check if this name contains another name."""
    if self == other or self.tag == NameTypeTag.UNK:
        return False
    if len(self.parts) < len(other.parts):
        return False

    if self.tag == NameTypeTag.PER:
        forms = [part.comparable for part in self.parts]
        other_forms = [part.comparable for part in other.parts]
        common_forms = list_intersection(forms, other_forms)

        # we want to make this support middle initials so that
        # "John Smith" can match "J. Smith"
        for ospan in other.spans:
            if ospan.symbol.category == Symbol.Category.INITIAL:
                if len(ospan.parts[0].form) > 1:
                    continue
                for span in self.spans:
                    if span.symbol == ospan.symbol:
                        common_forms.append(ospan.comparable)

        # If every part of the other name is represented in the common forms,
        # we consider it a match.
        if len(common_forms) == len(other_forms):
            return True

    return other.norm_form in self.norm_form

tag_text(text, tag, max_matches=1)

Tags name parts from a text with a known tag type.

For example, if the name is "John Smith", and we know that "John" is the given name, this method will tag that name part with NamePartTag.GIVEN.

The tagger can skip tokens in the name. For example, if the name is "Karl-Theodor Maria Nikolaus zu Guttenberg", and text is "Karl-Theodor Nikolaus", both "Karl-Theodor" and "Nikolaus" will be tagged, while "Maria" will not be tagged.

If text is not matched in full, the tagger will not tag any name parts. For example, if the name is "John Smith", and text is "John Ted", "John" will not be tagged.

The tagger will tag up to max_matches occurrences of text in the name. For example, if the name is "John John Smith", and text is "John", both "John"s will be tagged if max_matches is >= 2.

Source code in rigour/names/name.py
def tag_text(self, text: str, tag: NamePartTag, max_matches: int = 1) -> None:
    """Tags name parts from a text with a known tag type.

    For example, if the name is "John Smith", and we know that "John" is the given name,
    this method will tag that name part with NamePartTag.GIVEN.

    The tagger can skip tokens in the name. For example, if the name is
    "Karl-Theodor Maria Nikolaus zu Guttenberg", and `text` is "Karl-Theodor
    Nikolaus", both "Karl-Theodor" and "Nikolaus" will be tagged, while
    "Maria" will not be tagged.

    If `text` is not matched in full, the tagger will not tag any name parts. For example,
    if the name is "John Smith", and `text` is "John Ted", "John" will not be tagged.

    The tagger will tag up to `max_matches` occurrences of `text` in the name.
    For example, if the name is "John John Smith", and `text` is "John", both
    "John"s will be tagged if `max_matches` is >= 2.
    """
    tokens = tokenize_name(prenormalize_name(text))
    if len(tokens) == 0:
        return

    matches = 0
    matching: List[NamePart] = []
    for part in self.parts:
        # The token we need next is the one following those already matched;
        # len(matching) < len(tokens) always holds here because a complete
        # match below either returns or resets `matching`.
        next_token = tokens[len(matching)]
        if part.form == next_token:
            matching.append(part)
        # Only tag if we have matched the entire text
        if len(matching) == len(tokens):
            for part in matching:
                if part.tag == NamePartTag.UNSET:
                    part.tag = tag
                elif not part.tag.can_match(tag):
                    # if the part is already tagged, we check compatibility and
                    # otherwise mark it as an outcast from polite society
                    part.tag = NamePartTag.AMBIGUOUS
            matches += 1
            if matches >= max_matches:
                return
            # Reset the list of matching parts, i.e. start over matching from the
            # beginning of the tokenized text if we haven't reached `max_matches`.
            matching = []

NamePart

Bases: object

A part of a name, such as a given name or family name. This object is used to compare and match names. It generates and caches representations of the name in various processing forms.

Source code in rigour/names/part.py
class NamePart(object):
    """A part of a name, such as a given name or family name. This object is used to compare
    and match names. It generates and caches representations of the name in various processing
    forms."""

    __slots__ = ["form", "index", "tag", "latinize", "numeric", "_ascii", "_hash"]

    def __init__(
        self,
        form: str,
        index: Optional[int] = None,
        tag: NamePartTag = NamePartTag.UNSET,
    ) -> None:
        self.form = form
        self.index = index

        self.tag = tag
        """Part tag, see NamePartTag."""

        self.latinize = can_latinize(form)
        """Whether this part can be latinized."""

        self.numeric = form.isnumeric()
        """Whether this part is numeric."""

        # Lazily computed ASCII transliteration; see the `ascii` property.
        self._ascii: Optional[str] = None
        # Identity is (index, form): parts at the same position with the same
        # text compare equal regardless of tag.
        self._hash = hash((self.index, self.form))

    @property
    def ascii(self) -> Optional[str]:
        """Return an alphanumeric-only ASCII transliteration of the form, or
        None if nothing remains. Numeric parts are rendered as their integer
        value. The result is cached after the first computation."""
        if self._ascii is None:
            if self.numeric:
                value = self.integer
                if value is not None:
                    self._ascii = str(value)
                    return self._ascii
            out = ascii_text(self.form)
            self._ascii = "".join(o for o in out if o.isalnum())
        return self._ascii if len(self._ascii) > 0 else None

    @property
    def integer(self) -> Optional[int]:
        """Return the integer value of a numeric part, or None if the part is
        not numeric or does not parse to a whole number."""
        if self.numeric:
            numeric = string_number(self.form)
            if numeric is not None and numeric.is_integer():
                return int(numeric)
        return None

    @property
    def comparable(self) -> str:
        """Return the preferred comparison form: the integer string for
        numeric parts, the ASCII transliteration for latinizable parts, and
        the raw form otherwise."""
        if self.numeric:
            # NOTE(review): if `integer` is None this returns the string
            # "None" — confirm string_number always parses isnumeric() forms.
            return str(self.integer)
        if not self.latinize:
            return self.form
        ascii = self.ascii
        if ascii is None:
            return self.form
        return ascii

    @property
    def metaphone(self) -> Optional[str]:
        """Return a metaphone (phonetic) key for latinizable, non-numeric
        parts with more than two ASCII characters, otherwise None."""
        if self.latinize and not self.numeric:
            text = self.ascii
            if text is not None and len(text) > 2:
                return metaphone(text)
        return None

    def can_match(self, other: "NamePart") -> bool:
        """Check if this part can match another part. This is based on the tags of the parts."""
        return self.tag.can_match(other.tag)

    def __eq__(self, other: Any) -> bool:
        try:
            return other._hash == self._hash  # type: ignore
        except AttributeError:
            return False

    def __hash__(self) -> int:
        return self._hash

    def __len__(self) -> int:
        return len(self.form)

    def __repr__(self) -> str:
        return "<NamePart(%r, %s, %r)>" % (self.form, self.index, self.tag.value)

    @classmethod
    def tag_sort(cls, parts: list["NamePart"]) -> list["NamePart"]:
        """Sort name parts into canonical order by their tag (per NAME_TAGS_ORDER)."""
        return sorted(parts, key=lambda np: NAME_TAGS_ORDER.index(np.tag))

latinize = can_latinize(form) instance-attribute

Whether this part can be latinized.

numeric = form.isnumeric() instance-attribute

Whether this part is numeric.

tag = tag instance-attribute

Part tag, see NamePartTag.

can_match(other)

Check if this part can match another part. This is based on the tags of the parts.

Source code in rigour/names/part.py
def can_match(self, other: "NamePart") -> bool:
    """Check if this part can match another part. This is based solely on the
    tags of the two parts (see NamePartTag.can_match)."""
    return self.tag.can_match(other.tag)

tag_sort(parts) classmethod

Sort name parts into canonical order by their tag.

Source code in rigour/names/part.py
@classmethod
def tag_sort(cls, parts: list["NamePart"]) -> list["NamePart"]:
    """Sort name parts into canonical order by their tag (per NAME_TAGS_ORDER)."""
    return sorted(parts, key=lambda np: NAME_TAGS_ORDER.index(np.tag))

NamePartTag

Bases: Enum

Within a name, identify name part types.

Source code in rigour/names/tag.py
class NamePartTag(Enum):
    """Within a name, identify name part types."""

    UNSET = "UNSET"  # No tag has been assigned yet
    AMBIGUOUS = "AMBIGUOUS"  # Conflicting tags were applied

    TITLE = "TITLE"
    GIVEN = "GIVEN"
    MIDDLE = "MIDDLE"
    FAMILY = "FAMILY"
    TRIBAL = "TRIBAL"
    PATRONYMIC = "PATRONYMIC"
    MATRONYMIC = "MATRONYMIC"
    HONORIFIC = "HONORIFIC"
    SUFFIX = "SUFFIX"
    NICK = "NICK"

    STOP = "STOP"  # Stopword
    NUM = "NUM"
    LEGAL = "LEGAL"  # Legal form of an organisation

    def can_match(self, other: "NamePartTag") -> bool:
        """Check if this tag can match the other tag.

        Wildcard tags match anything; identical tags always match. Otherwise,
        tags in the module-level GIVEN_NAME_TAGS and FAMILY_NAME_TAGS groups
        only match other tags within their own group.
        """
        if self in WILDCARDS or other in WILDCARDS:
            return True
        if self == other:
            return True
        if self in GIVEN_NAME_TAGS and other not in GIVEN_NAME_TAGS:
            return False
        if self in FAMILY_NAME_TAGS and other not in FAMILY_NAME_TAGS:
            return False
        return True

can_match(other)

Check if this tag can match the other tag.

Source code in rigour/names/tag.py
def can_match(self, other: "NamePartTag") -> bool:
    """Check if this tag can match the other tag.

    Wildcard tags match anything; identical tags always match. Otherwise,
    tags in the module-level GIVEN_NAME_TAGS and FAMILY_NAME_TAGS groups
    only match other tags within their own group.
    """
    if self in WILDCARDS or other in WILDCARDS:
        return True
    if self == other:
        return True
    if self in GIVEN_NAME_TAGS and other not in GIVEN_NAME_TAGS:
        return False
    if self in FAMILY_NAME_TAGS and other not in FAMILY_NAME_TAGS:
        return False
    return True

NameTypeTag

Bases: Enum

Metadata on what sort of object is described by a name

Source code in rigour/names/tag.py
class NameTypeTag(Enum):
    """Metadata on what sort of object is described by a name.

    Used as `Name.tag` to drive type-specific comparison (e.g. person-name
    containment logic in `Name.contains`)."""

    UNK = "UNK"  # Unknown
    ENT = "ENT"  # Entity
    PER = "PER"  # Person
    ORG = "ORG"  # Organization/Company
    OBJ = "OBJ"  # Object - Vessel, Security, etc.

Span

A span is a set of parts of a name that have been tagged with a symbol.

Source code in rigour/names/part.py
class Span:
    """A span is a set of parts of a name that have been tagged with a symbol."""

    __slots__ = ["parts", "symbol"]

    def __init__(self, parts: List[NamePart], symbol: Symbol) -> None:
        # Stored as a tuple so the span is immutable and hashable.
        self.parts = tuple(parts)
        self.symbol = symbol

    @property
    def comparable(self) -> str:
        """Return the comparison-suited string representation of the span."""
        return " ".join([part.comparable for part in self.parts])

    def __len__(self) -> int:
        """Return the combined character length of all parts in the span."""
        return sum(len(part) for part in self.parts)

    def __hash__(self) -> int:
        return hash((self.parts, self.symbol))

    def __eq__(self, other: Any) -> bool:
        # Compare by value rather than by hash: hash equality does not imply
        # equality (collisions), and hash(other) raises TypeError for
        # unhashable operands.
        if not isinstance(other, Span):
            return False
        return self.parts == other.parts and self.symbol == other.symbol

    def __repr__(self) -> str:
        return f"<Span({self.parts!r}, {self.symbol})>"

comparable property

Return the comparison-suited string representation of the span.

__len__()

Return the combined character length of all parts in the span.

Source code in rigour/names/part.py
def __len__(self) -> int:
    """Return the combined character length of all parts in the span."""
    return sum(len(part) for part in self.parts)

Symbol

A symbol is a semantic interpretation applied to one or more parts of a name. Symbols can represent various categories such as organization classes, initials, names, numeric, or phonetic transcriptions. Each symbol has a category and an identifier.

Source code in rigour/names/symbol.py
class Symbol:
    """A semantic interpretation attached to one or more parts of a name.

    Each symbol pairs a category — organization class, initial, known name,
    nickname, number, location, or phonetic key — with an arbitrary
    identifier. Symbols with the same category and id compare equal."""

    class Category(Enum):
        ORG_CLASS = "ORGCLS"
        SYMBOL = "SYMBOL"
        INITIAL = "INITIAL"
        NAME = "NAME"
        NICK = "NICK"
        NUMERIC = "NUM"
        LOCATION = "LOC"
        PHONETIC = "PHON"

    __slots__ = ["category", "id"]

    def __init__(self, category: Category, id: Any) -> None:
        """Create a symbol with a category and an id."""
        self.category = category
        self.id = id

    def __hash__(self) -> int:
        # Hash on the same (category, id) pair that defines equality.
        return hash((self.category, self.id))

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, Symbol):
            return (self.category, self.id) == (other.category, other.id)
        return False

    def __str__(self) -> str:
        return f"[{self.category.value}:{self.id}]"

    def __repr__(self) -> str:
        return f"<Symbol({self.category}, {self.id})>"

__init__(category, id)

Create a symbol with a category and an id.

Source code in rigour/names/symbol.py
def __init__(self, category: Category, id: Any) -> None:
    """Create a symbol with a category and an arbitrary identifier."""
    self.category = category
    self.id = id

align_person_name_order(left, right)

Aligns the name parts of a person name for two names based on their tags and their string similarity such that the most similar name parts are matched.

Parameters:

Name Type Description Default
left List[NamePart]

The name parts of the first name.

required
right List[NamePart]

The name parts of the second name.

required

Returns:

Type Description
Tuple[List[NamePart], List[NamePart]]

Tuple[List[NamePart], List[NamePart]]: A tuple containing the sorted name parts of both names.

Source code in rigour/names/alignment.py
def align_person_name_order(
    left: List[NamePart], right: List[NamePart]
) -> Tuple[List[NamePart], List[NamePart]]:
    """Aligns the name parts of a person name for two names based on their tags and their string
    similarity such that the most similar name parts are matched.

    Args:
        left (List[NamePart]): The name parts of the first name.
        right (List[NamePart]): The name parts of the second name.

    Returns:
        Tuple[List[NamePart], List[NamePart]]: A tuple containing the sorted name parts of both names.
    """
    if not len(left):
        # Nothing to align against; fall back to canonical tag order.
        return (left, NamePart.tag_sort(right))

    left_sorted: List[NamePart] = []
    right_sorted: List[NamePart] = []

    # Greedy best-first pairing: repeatedly pick the most similar
    # tag-compatible pair of unused parts, considering longer parts first.
    left_unused = sorted(left, key=len, reverse=True)
    right_unused = sorted(right, key=len, reverse=True)
    while len(left_unused) > 0 and len(right_unused) > 0:
        best_score = 0.0
        best_left_parts: Optional[List[NamePart]] = None
        best_right_parts: Optional[List[NamePart]] = None
        for qp, rp in product(left_unused, right_unused):
            if not NamePartTag.can_match(qp.tag, rp.tag):
                continue
            if qp.comparable == rp.comparable:
                # Exact match: no better candidate possible, stop searching.
                best_score = 1.0
                best_left_parts = [qp]
                best_right_parts = [rp]
                break
            # check the Levenshtein distance between the two parts
            score = _name_levenshtein([qp], [rp])
            if score > best_score:
                best_left_parts = [qp]
                best_right_parts = [rp]
                # When one part is longer, try grouping several short parts
                # from the other side against it (see _pack_short_parts —
                # presumably for concatenated name parts; TODO confirm).
                if len(qp.form) > len(rp.form):
                    best_right_parts = _pack_short_parts(qp, rp, right_unused)
                elif len(rp.form) > len(qp.form):
                    best_left_parts = _pack_short_parts(rp, qp, left_unused)
                # NOTE(review): re-scoring the packed grouping can produce a
                # lower score than the single-part score that won above —
                # confirm this is intended.
                best_score = _name_levenshtein(best_left_parts, best_right_parts)

        if best_score == 0.0:
            # no match found, break out of the loop
            break

        if best_left_parts is not None:
            left_sorted.extend(best_left_parts)
            for qp in best_left_parts:
                left_unused.remove(qp)
        if best_right_parts is not None:
            right_sorted.extend(best_right_parts)
            for rp in best_right_parts:
                right_unused.remove(rp)

    if not len(left_sorted):
        # Nothing could be paired; fall back to canonical tag order.
        return (NamePart.tag_sort(left), NamePart.tag_sort(right))

    # Unmatched leftovers are appended after the aligned pairs.
    left_sorted.extend(left_unused)
    right_sorted.extend(right_unused)
    return (left_sorted, right_sorted)

extract_org_types(name, normalizer=_normalize_compare, generic=False)

Match any organization type designation (e.g. LLC, Inc, GmbH) in the given entity name and return the extracted type.

This can be used as a very poor man's method to determine if a given string is a company name.

Parameters:

Name Type Description Default
name str

The text to be processed. It is assumed to be already normalized (see below).

required
normalizer Callable[[str | None], str | None]

A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.

_normalize_compare
generic bool

If True, return the generic form of the organization type (e.g. LLC, JSC) instead of the type-specific comparison form (GmbH, AB, NV).

False

Returns:

Type Description
List[Tuple[str, str]]

List[Tuple[str, str]]: List of tuples of the org type as matched, and the compare form of it.

Source code in rigour/names/org_types.py
def extract_org_types(
    name: str, normalizer: Normalizer = _normalize_compare, generic: bool = False
) -> List[Tuple[str, str]]:
    """Find every organization type designation (e.g. LLC, Inc, GmbH) in the given
    entity name and return the extracted types.

    This can be used as a very poor man's method to determine if a given string is a company name.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized (see below).
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.
        generic (bool): If True, return the generic form of the organization type (e.g. LLC, JSC) instead
            of the type-specific comparison form (GmbH, AB, NV).

    Returns:
        List[Tuple[str, str]]: Pairs of the org type as matched and its compare (or generic) form.
    """
    # Pick the replacer factory for the requested output form, then map each
    # extracted match to its mapped form (falling back to the match itself).
    make_replacer = _generic_replacer if generic else _compare_replacer
    replacer = make_replacer(normalizer=normalizer)
    mapping = replacer.mapping
    return [(found, mapping.get(found, found)) for found in replacer.extract(name)]

is_name(name)

Check if the given string is a name. The string is considered a name if it contains at least one character that is a letter (category 'L' in Unicode).

Source code in rigour/names/check.py
def is_name(name: str) -> bool:
    """Check if the given string is a name. The string is considered a name if it
    contains at least one character that is a letter (Unicode category 'L')."""
    return any(unicodedata.category(ch)[0] == "L" for ch in name)

is_stopword(form, *, normalizer=normalize_name, normalize=False)

Check if the given form is a stopword. The stopword list is normalized first.

Parameters:

Name Type Description Default
form str

The token to check, must already be normalized.

required
normalizer Normalizer

The normalizer to use for checking stopwords.

normalize_name
normalize bool

Whether to normalize the form before checking.

False

Returns:

Name Type Description
bool bool

True if the form is a stopword, False otherwise.

Source code in rigour/names/check.py
def is_stopword(
    form: str, *, normalizer: Normalizer = normalize_name, normalize: bool = False
) -> bool:
    """Check if the given form is a stopword. The stopword list is normalized first.

    Args:
        form (str): The token to check, must already be normalized unless
            `normalize` is True.
        normalizer (Normalizer): The normalizer to use for checking stopwords.
        normalize (bool): Whether to normalize the form before checking.

    Returns:
        bool: True if the form is a stopword, False otherwise.
    """
    candidate = normalizer(form) if normalize else form
    # A form that normalizes to nothing cannot be a stopword.
    if candidate is None:
        return False
    return candidate in _load_stopwords(normalizer)

load_person_names()

Load the person QID to name mappings from disk. This is a collection of aliases (in various alphabets) of person name parts mapped to a Wikidata QID representing that name part.

Returns:

Type Description
Generator[Tuple[str, List[str]], None, None]

Generator[Tuple[str, List[str]], None, None]: A generator yielding tuples of QID and list of names.

Source code in rigour/names/person.py
def load_person_names() -> Generator[Tuple[str, List[str]], None, None]:
    """Load the person QID to name mappings from disk. This is a collection
    of aliases (in various alphabets) of person name parts mapped to a
    Wikidata QID representing that name part.

    Returns:
        Generator[Tuple[str, List[str]], None, None]: A generator yielding tuples of QID and list of names.
    """
    with open(NAMES_DATA_PATH, "r", encoding="utf-8") as fh:
        for raw_line in fh:
            # Each line has the shape: "alias1, alias2, ... => QID"
            alias_part, qid = raw_line.strip().split(" => ")
            yield qid, alias_part.split(", ")

load_person_names_mapping(normalizer=noop_normalizer, min_mappings=1)

Load the person QID to name mappings from disk. This is a collection of aliases (in various alphabets) of person name parts mapped to a Wikidata QID representing that name part.

Parameters:

Name Type Description Default
normalizer Normalizer

A function to normalize names. Defaults to noop_normalizer.

noop_normalizer
min_mappings int

The minimum number of distinct normalized forms a QID must have for its names to be included.

1

Returns:

Type Description
Dict[str, Set[str]]

Dict[str, Set[str]]: A dictionary mapping normalized names to sets of QIDs.

Source code in rigour/names/person.py
def load_person_names_mapping(
    normalizer: Normalizer = noop_normalizer, min_mappings: int = 1
) -> Dict[str, Set[str]]:
    """Load the person QID to name mappings from disk. This is a collection
    of aliases (in various alphabets) of person name parts mapped to a
    Wikidata QID representing that name part.

    Args:
        normalizer (Normalizer, optional): A function to normalize names. Defaults to noop_normalizer.
        min_mappings (int, optional): The minimum number of distinct normalized forms a QID
            must produce for its names to be included in the mapping. Defaults to 1.

    Returns:
        Dict[str, Set[str]]: A dictionary mapping normalized names to sets of QIDs.
    """
    names: Dict[str, Set[str]] = {}
    for qid, aliases in load_person_names():
        forms: Set[str] = set()
        for alias in aliases:
            norm_alias = normalizer(alias)
            # Skip aliases that the normalizer rejects or reduces to nothing:
            if norm_alias is None or not len(norm_alias):
                continue
            forms.add(norm_alias)
        # Drop QIDs with too few distinct normalized forms:
        if len(forms) < min_mappings:
            continue
        for form in forms:
            names.setdefault(form, set()).add(qid)
    return names

normalize_name(name, sep=WS) cached

Normalize a name for tokenization and matching.

Source code in rigour/names/tokenize.py
@lru_cache(maxsize=MEMO_TINY)
def normalize_name(name: Optional[str], sep: str = WS) -> Optional[str]:
    """Normalize a name for tokenization and matching."""
    if name is None:
        return None
    tokens = tokenize_name(prenormalize_name(name))
    result = sep.join(tokens)
    # An empty normalization result is reported as None:
    return result if len(result) else None

pick_case(names)

Pick the best mix of lower- and uppercase characters from a set of names that are identical except for case. If the names are not identical, undefined things happen (not recommended).

Parameters:

Name Type Description Default
names List[str]

A list of identical names in different cases.

required

Returns:

Name Type Description
str str

The best name for display.

Source code in rigour/names/pick.py
def pick_case(names: List[str]) -> str:
    """Pick the best mix of lower- and uppercase characters from a set of names
    that are identical except for case. If the names are not identical, undefined
    things happen (not recommended).

    Args:
        names (List[str]): A list of identical names in different cases.

    Returns:
        str: The best name for display.
    """
    if not names:
        raise ValueError("Cannot pick a name from an empty list.")
    if len(names) == 1:
        return names[0]

    # Fast path: if the title-cased shortest variant already exists, use it.
    titled = min(names, key=len).title()
    if titled in names:
        return titled

    scores: Dict[str, float] = {}
    for candidate in names:
        at_word_start = True
        # Start the penalty at the length as a bias for shorter names (`ẞ` over `ss`).
        penalty = len(candidate)
        for char in candidate:
            if not char.isalpha():
                at_word_start = True
            elif at_word_start:
                # Lowercase at a word start is penalized more heavily:
                if not char.isupper():
                    penalty += 2
                at_word_start = False
            elif char.isupper():
                # Uppercase mid-word costs one point:
                penalty += 1
        scores[candidate] = penalty / len(candidate)

    if not scores:
        raise ValueError("Names could not be scored: %r" % names)

    # Lowest score wins; ties are broken by preferring the shorter name.
    best, _ = min(scores.items(), key=lambda item: (item[1], len(item[0])))
    return best

pick_name(names)

Pick the best name from a list of names. This is meant to pick a centroid name, with a bias towards names in a latin script.

Parameters:

Name Type Description Default
names List[str]

A list of names.

required

Returns:

Type Description
Optional[str]

Optional[str]: The best name for display.

Source code in rigour/names/pick.py
def pick_name(names: List[str]) -> Optional[str]:
    """Pick the best name from a list of names. This is meant to pick a centroid
    name, with a bias towards names in a latin script.

    Args:
        names (List[str]): A list of names.

    Returns:
        Optional[str]: The best name for display.
    """
    weights: Dict[str, float] = defaultdict(float)
    forms: Dict[str, List[str]] = defaultdict(list)
    latin_names: List[str] = []
    for name in sorted(names):
        folded = name.strip().casefold()
        if not len(folded):
            continue
        share = latin_share(name)
        if share > 0.85:
            latin_names.append(name)
        # Even totally non-Latin names have a base weight of 1:
        weight = 1 + share
        weights[folded] += weight
        forms[folded].append(name)
        forms[folded].append(name.title())

        ascii_form = ascii_text(folded)
        if len(ascii_form) > 2:
            weights[ascii_form] += weight
            forms[ascii_form].append(name)

    # A single clearly-Latin name wins outright:
    if len(latin_names) == 1:
        return latin_names[0]

    for candidate in levenshtein_pick(list(weights.keys()), weights):
        for surface in levenshtein_pick(forms.get(candidate, []), {}):
            if surface in names:
                return surface
    return None

prenormalize_name(name)

Prepare a name for tokenization and matching.

Source code in rigour/names/tokenize.py
def prenormalize_name(name: Optional[str]) -> str:
    """Prepare a name for tokenization and matching."""
    # Unicode NFC normalization is deliberately left disabled here:
    # name = unicodedata.normalize("NFC", name)
    return "" if name is None else name.casefold()

reduce_names(names)

Select a reduced set of names from a list of names. This is used to prepare the set of names linked to a person, organization, or other entity for publication.

Parameters:

Name Type Description Default
names List[str]

A list of names.

required

Returns:

Type Description
List[str]

List[str]: The reduced list of names.

Source code in rigour/names/pick.py
def reduce_names(names: List[str]) -> List[str]:
    """Select a reduced set of names from a list of names. This is used to
    prepare the set of names linked to a person, organization, or other entity
    for publication.

    Args:
        names (List[str]): A list of names.

    Returns:
        List[str]: The reduced list of names.
    """
    if len(names) < 2:
        return [name for name in names if is_name(name)]
    grouped: Dict[str, List[str]] = defaultdict(list)
    for name in names:
        # Filter names that are not valid (e.g. empty or do not contain any letters)
        if not is_name(name):
            log.warning("Invalid name found: %r", name)
            continue
        grouped[name.casefold()].append(name)
    output: List[str] = []
    for variants in grouped.values():
        try:
            output.append(pick_case(variants))
        except ValueError:
            # Keep all variants if a single best case cannot be chosen:
            log.exception("Could not pick name from group: %r", variants)
            output.extend(variants)
    return output

remove_obj_prefixes(name)

Remove prefixes like The, MV, etc.

Source code in rigour/names/prefix.py
def remove_obj_prefixes(name: str) -> str:
    """Remove prefixes like The, MV, etc."""
    pattern = re_prefixes(OBJ_NAME_PREFIXES)
    return pattern.sub("", name)

remove_org_prefixes(name)

Remove organisation name prefixes (the entries in ORG_NAME_PREFIXES).

Source code in rigour/names/prefix.py
def remove_org_prefixes(name: str) -> str:
    """Remove organisation name prefixes (the entries in ORG_NAME_PREFIXES).

    NOTE(review): the previous docstring ("Mr., Mrs.") was apparently copy-pasted
    from remove_person_prefixes; this function strips organisation prefixes.
    """
    return re_prefixes(ORG_NAME_PREFIXES).sub("", name)

remove_org_types(name, replacement='', normalizer=_normalize_compare)

Match any organization type designation (e.g. LLC, Inc, GmbH) in the given entity name and replace it with the given fixed string (empty by default, which signals removal).

Parameters:

Name Type Description Default
name str

The text to be processed. It is assumed to be already normalized (see below).

required
normalizer Callable[[str | None], str | None]

A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.

_normalize_compare

Returns:

Name Type Description
str str

The text with organization types replaced/removed.

Source code in rigour/names/org_types.py
def remove_org_types(
    name: str, replacement: str = "", normalizer: Normalizer = _normalize_compare
) -> str:
    """Match any organization type designation (e.g. LLC, Inc, GmbH) in the given entity name and
    replace it with the given fixed string (empty by default, which signals removal).

    Args:
        name (str): The text to be processed. It is assumed to be already normalized (see below).
        replacement (str): The fixed string substituted for each matched organization type.
            Defaults to the empty string, i.e. removal.
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.

    Returns:
        str: The text with organization types replaced/removed.
    """
    replacer = _compare_replacer(normalizer=normalizer)
    return replacer.remove(name, replacement=replacement)

remove_person_prefixes(name)

Remove prefixes like Mr., Mrs., etc.

Source code in rigour/names/prefix.py
def remove_person_prefixes(name: str) -> str:
    """Remove prefixes like Mr., Mrs., etc."""
    pattern = re_prefixes(PERSON_NAME_PREFIXES)
    return pattern.sub("", name)

replace_org_types_compare(name, normalizer=_normalize_compare, generic=False) cached

Replace any organization type indicated in the given entity name (often as a prefix or suffix) with a heavily normalized form label. This will re-write country-specific entity types (eg. GmbH) into a simplified spelling suitable for comparison using string distance. The resulting text is meant to be used in comparison processes, but no longer fit for presentation to a user.

Parameters:

Name Type Description Default
name str

The text to be processed. It is assumed to be already normalized (see below).

required
normalizer Callable[[str | None], str | None]

A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.

_normalize_compare
generic bool

If True, return the generic form of the organization type (e.g. LLC, JSC) instead of the type-specific comparison form (GmbH, AB, NV).

False

Returns:

Type Description
str

str: The text with organization types replaced.

Source code in rigour/names/org_types.py
@lru_cache(maxsize=1024)
def replace_org_types_compare(
    name: str, normalizer: Normalizer = _normalize_compare, generic: bool = False
) -> str:
    """Replace any organization type indicated in the given entity name (often as a prefix or suffix)
    with a heavily normalized form label. This will re-write country-specific entity types (eg. GmbH)
    into a simplified spelling suitable for comparison using string distance. The resulting text is
    meant to be used in comparison processes, but no longer fit for presentation to a user.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized (see below).
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.
        generic (bool): If True, return the generic form of the organization type (e.g. LLC, JSC) instead
            of the type-specific comparison form (GmbH, AB, NV).

    Returns:
        str: The text with organization types replaced; the input is returned unchanged if the
            replacer yields no result.
    """
    _func = _generic_replacer if generic else _compare_replacer
    replacer = _func(normalizer=normalizer)
    # Fall back to the original name if the replacer produces nothing:
    return replacer(name) or name

replace_org_types_display(name, normalizer=normalize_display)

Replace organization types in the text with their shortened form. This will perform a display-safe (light) form of normalization, useful for shortening spelt-out legal forms into common abbreviations (eg. Siemens Aktiengesellschaft -> Siemens AG).

If the replacement yields no result, the original text is returned as-is.

Parameters:

Name Type Description Default
name str

The text to be processed. It is assumed to be already normalized (see below).

required
normalizer Callable[[str | None], str | None]

A text normalization function to run on the lookup values before matching to remove text anomalies and make matches more likely.

normalize_display

Returns:

Type Description
str

str: The text with organization types replaced.

Source code in rigour/names/org_types.py
def replace_org_types_display(
    name: str, normalizer: Normalizer = normalize_display
) -> str:
    """Replace organization types in the text with their shortened form. This will perform
    a display-safe (light) form of normalization, useful for shortening spelt-out legal forms
    into common abbreviations (eg. Siemens Aktiengesellschaft -> Siemens AG).

    If the replacement yields no result (None), the original text is returned as-is.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized (see below).
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.

    Returns:
        str: The text with organization types replaced.
    """
    # Remember all-caps inputs so the replaced text keeps the same casing:
    is_uppercase = name.isupper()
    replacer = _display_replacer(normalizer=normalizer)
    out_text = replacer(name)
    if out_text is None:
        return name
    if is_uppercase:
        out_text = out_text.upper()
    return out_text

tag_org_name(name, normalizer)

Tag the name with the organization type and symbol tags.

Source code in rigour/names/tagging.py
def tag_org_name(name: Name, normalizer: Normalizer) -> Name:
    """Tag the name with the organization type and symbol tags."""
    org_tagger = _get_org_tagger(normalizer)
    # Apply every phrase/symbol pair the tagger finds in the normalized form:
    for tagged_phrase, tagged_symbol in org_tagger(name.norm_form):
        name.apply_phrase(tagged_phrase, tagged_symbol)
    return _infer_part_tags(name)

tag_person_name(name, normalizer, any_initials=False)

Tag a person's name with the person name part and other symbol tags.

Source code in rigour/names/tagging.py
def tag_person_name(
    name: Name, normalizer: Normalizer, any_initials: bool = False
) -> Name:
    """Tag a person's name with the person name part and other symbol tags."""
    # Tag given name abbreviations. This is meant to handle a case where the person's
    # first or middle name is an abbreviation, e.g. "J. Smith" or "John Q. Smith".
    for piece in name.parts:
        if not piece.latinize:
            continue
        initial = Symbol(Symbol.Category.INITIAL, piece.comparable[0])
        # Either any single-character part qualifies (when any_initials is set),
        # or the part must carry one of the initial-bearing tags:
        if (any_initials and len(piece.form) == 1) or piece.tag in INTITIAL_TAGS:
            name.apply_part(piece, initial)

    # Tag the name with person symbols:
    person_tagger = _get_person_tagger(normalizer)
    for tagged_phrase, tagged_symbol in person_tagger(name.norm_form):
        name.apply_phrase(tagged_phrase, tagged_symbol)

    return _infer_part_tags(name)

tokenize_name(text, token_min_length=1)

Split a person or entity's name into name parts.

Source code in rigour/names/tokenize.py
def tokenize_name(text: str, token_min_length: int = 1) -> List[str]:
    """Split a person or entity's name into name parts."""
    # FIXME: Do we want to support CJK scripts at some stage?
    tokens: List[str] = []
    token: List[str] = []
    # TODO: Do we want to do some form of unicode normalization here?
    # text = unicodedata.normalize("NFC", text)
    for char in text:
        if char in SKIP_CHARACTERS:
            continue
        cat = unicodedata.category(char)
        chr = TOKEN_SEP_CATEGORIES.get(cat, char)
        if chr is None:
            continue
        if chr == WS:
            if len(token) >= token_min_length:
                tokens.append("".join(token))
            token.clear()
            continue
        token.append(chr)

    if len(token) >= token_min_length:
        tokens.append("".join(token))
    return tokens