Name handling utilities for person and organisation names. This module contains a large (and growing)
set of tools for handling names. In general, there are three types of names: people, organizations,
and objects. Different normalization may be required for each of these types, including prefix removal
for person names (e.g. "Mr." or "Ms.") and type normalization for organization names (e.g.
"Incorporated" -> "Inc" or "Limited" -> "Ltd").
The Name class provides a structure for a name, including its original form, normalized form,
metadata on the type of thing described by the name, and the language of the name. The NamePart
class is used to represent individual parts of a name, such as the first name, middle name, and last name.
Name
Bases: object
A name of a thing, such as a person, organization or object. Each name consists of a
sequence of parts, each of which has a form and a tag. The form is the text of the part, and the tag
is a label indicating the type of part. For example, in the name "John Smith", "John" is a given name
and "Smith" is a family name. The tag for "John" would be NamePartTag.GIVEN
and the tag for "Smith"
would be NamePartTag.FAMILY
. The form for both parts would be the text of the part itself.
Source code in rigour/names/name.py
class Name(object):
    """A name of a thing, such as a person, organization or object. Each name consists of a
    sequence of parts, each of which has a form and a tag. The form is the text of the part, and the tag
    is a label indicating the type of part. For example, in the name "John Smith", "John" is a given name
    and "Smith" is a family name. The tag for "John" would be `NamePartTag.GIVEN` and the tag for "Smith"
    would be `NamePartTag.FAMILY`. The form for both parts would be the text of the part itself.
    """

    __slots__ = ["original", "form", "tag", "lang", "_parts", "spans"]

    def __init__(
        self,
        original: str,
        form: Optional[str] = None,
        tag: NameTypeTag = NameTypeTag.UNK,
        lang: Optional[str] = None,
        parts: Optional[List[NamePart]] = None,
    ):
        # The name exactly as provided by the caller.
        self.original = original
        # Pre-normalized (lower-cased) matching form; derived from the
        # original when not supplied explicitly.
        self.form = form or prenormalize_name(original)
        # What kind of thing the name refers to (person, org, object, ...).
        self.tag = tag
        # Optional language hint; stored but not interpreted here.
        self.lang = lang
        # Tokenized lazily by the `parts` property when None.
        self._parts = parts
        # Symbol spans attached via apply_phrase/apply_part.
        self.spans: List[Span] = []

    @property
    def parts(self) -> List[NamePart]:
        """Return the name parts, tokenizing the normalized form on first access."""
        if self._parts is None:
            self._parts = []
            for i, form in enumerate(tokenize_name(self.form)):
                self._parts.append(NamePart(form, i))
        return self._parts

    @property
    def comparable(self) -> str:
        """Return the ASCII representation of the name, if available."""
        return " ".join(part.comparable for part in self.parts)

    @property
    def norm_form(self) -> str:
        """Return the normalized form of the name by joining name parts."""
        return " ".join([part.form for part in self.parts])

    def tag_text(self, text: str, tag: NamePartTag, max_matches: int = 1) -> None:
        """Tag up to `max_matches` occurrences of `text` among the name parts.

        The text is normalized and tokenized; parts whose form matches the
        token sequence receive `tag`. Parts already carrying an incompatible
        tag are reset to `NamePartTag.ANY`.
        """
        tokens = tokenize_name(prenormalize_name(text))
        matches = 0
        matching: List[NamePart] = []
        for part in self.parts:
            next_token = tokens[len(matching)]
            if part.form == next_token:
                matching.append(part)
            # NOTE(review): a non-matching part leaves `matching` untouched,
            # so the matched tokens may span non-adjacent parts — confirm
            # whether contiguity is intended here.
            if len(matching) == len(tokens):
                for part in matching:
                    if part.tag == NamePartTag.ANY:
                        part.tag = tag
                    elif not part.tag.can_match(tag):
                        # if the part is already tagged, we check compatibility and
                        # possibly revert to the basic type
                        part.tag = NamePartTag.ANY
                matches += 1
                if matches >= max_matches:
                    return
                matching = []

    def apply_phrase(self, phrase: str, symbol: Symbol) -> None:
        """Apply a symbol to a phrase in the name."""
        matching: List[NamePart] = []
        # The phrase is assumed to be space-delimited in normalized form.
        tokens = phrase.split(" ")
        for part in self.parts:
            next_token = tokens[len(matching)]
            if part.form == next_token:
                matching.append(part)
            if len(matching) == len(tokens):
                self.spans.append(Span(matching, symbol))
                matching = []

    def apply_part(self, part: NamePart, symbol: Symbol) -> None:
        """Apply a symbol to a part of the name."""
        self.spans.append(Span([part], symbol))

    @property
    def symbols(self) -> Set[Symbol]:
        """Return the set of symbols applied to the name."""
        symbols: Set[Symbol] = set()
        for span in self.spans:
            symbols.add(span.symbol)
        return symbols

    def contains(self, other: "Name") -> bool:
        """Check if this name contains another name."""
        # Equal names and untyped names never "contain" one another.
        if self == other or self.tag == NameTypeTag.UNK:
            return False
        # A name with fewer parts cannot contain a longer one.
        if len(self.parts) < len(other.parts):
            return False
        if self.tag == NameTypeTag.PER:
            forms = [part.comparable for part in self.parts]
            other_forms = [part.comparable for part in other.parts]
            common_forms = list_intersection(forms, other_forms)
            # we want to make this support middle initials so that
            # "John Smith" can match "J. Smith"
            for ospan in other.spans:
                if ospan.symbol.category == Symbol.Category.INITIAL:
                    # Only single-character parts count as initials here.
                    if len(ospan.parts[0].form) > 1:
                        continue
                    for span in self.spans:
                        if span.symbol == ospan.symbol:
                            common_forms.append(ospan.comparable)
            # If every part of the other name is represented in the common forms,
            # we consider it a match.
            if len(common_forms) == len(other_forms):
                return True
        # Fallback for non-person names: substring check on normalized forms.
        return other.norm_form in self.norm_form

    def __eq__(self, other: Any) -> bool:
        # Names compare equal on the normalized `form` only.
        try:
            return self.form == other.form  # type: ignore
        except AttributeError:
            return False

    def __hash__(self) -> int:
        return hash(self.form)

    def __str__(self) -> str:
        return self.original

    def __repr__(self) -> str:
        return "<Name(%r, %r, %r)>" % (self.original, self.form, self.tag.value)
comparable
property
Return the ASCII representation of the name, if available.
norm_form
property
Return the normalized form of the name by joining name parts.
symbols
property
Return the set of symbols applied to the name.
apply_part(part, symbol)
Apply a symbol to a part of the name.
Source code in rigour/names/name.py
def apply_part(self, part: NamePart, symbol: Symbol) -> None:
    """Attach a symbol to a single part of the name as a one-part span."""
    span = Span([part], symbol)
    self.spans.append(span)
apply_phrase(phrase, symbol)
Apply a symbol to a phrase in the name.
Source code in rigour/names/name.py
def apply_phrase(self, phrase: str, symbol: Symbol) -> None:
    """Apply a symbol to a phrase in the name."""
    # The phrase is matched token-by-token against consecutive matching parts.
    tokens = phrase.split(" ")
    matched: List[NamePart] = []
    for part in self.parts:
        if part.form == tokens[len(matched)]:
            matched.append(part)
        if len(matched) == len(tokens):
            # Full phrase covered: record the span and start over.
            self.spans.append(Span(matched, symbol))
            matched = []
contains(other)
Check if this name contains another name.
Source code in rigour/names/name.py
def contains(self, other: "Name") -> bool:
    """Check if this name contains another name.

    For person names, containment is decided part-wise (with support for
    single-letter initials via INITIAL symbols); otherwise a substring
    check on the normalized forms is used.
    """
    # Equal names and untyped names never "contain" one another.
    if self == other or self.tag == NameTypeTag.UNK:
        return False
    # A name with fewer parts cannot contain a longer one.
    if len(self.parts) < len(other.parts):
        return False
    if self.tag == NameTypeTag.PER:
        forms = [part.comparable for part in self.parts]
        other_forms = [part.comparable for part in other.parts]
        common_forms = list_intersection(forms, other_forms)
        # we want to make this support middle initials so that
        # "John Smith" can match "J. Smith"
        for ospan in other.spans:
            if ospan.symbol.category == Symbol.Category.INITIAL:
                # Only single-character parts count as initials here.
                if len(ospan.parts[0].form) > 1:
                    continue
                for span in self.spans:
                    if span.symbol == ospan.symbol:
                        common_forms.append(ospan.comparable)
        # If every part of the other name is represented in the common forms,
        # we consider it a match.
        if len(common_forms) == len(other_forms):
            return True
    # Fallback for non-person names: substring check on normalized forms.
    return other.norm_form in self.norm_form
NamePart
Bases: object
A part of a name, such as a given name or family name. This object is used to compare
and match names. It generates and caches representations of the name in various processing
forms.
Source code in rigour/names/part.py
class NamePart(object):
    """A part of a name, such as a given name or family name. This object is used to compare
    and match names. It generates and caches representations of the name in various processing
    forms."""

    __slots__ = ["form", "index", "tag", "latinize", "_ascii", "_hash"]

    def __init__(
        self,
        form: str,
        index: Optional[int] = None,
        tag: NamePartTag = NamePartTag.ANY,
    ) -> None:
        # Normalized token text of this part.
        self.form = form
        # Position of the part within the name (None when unknown).
        self.index = index
        self.tag = tag
        # Whether the form is eligible for latin transliteration.
        self.latinize = can_latinize(form)
        # Lazily computed ASCII form; see the `ascii` property.
        self._ascii: Optional[str] = None
        # Identity is (index, form); precomputed for fast hashing/equality.
        self._hash = hash((self.index, self.form))

    @property
    def ascii(self) -> Optional[str]:
        """Return the alphanumeric ASCII transliteration, or None when empty."""
        if self._ascii is None:
            # NOTE(review): assumes ascii_text never returns None here —
            # confirm, otherwise the join would raise TypeError.
            out = ascii_text(self.form)
            self._ascii = "".join(o for o in out if o.isalnum())
        return self._ascii if len(self._ascii) > 0 else None

    @property
    def comparable(self) -> str:
        """Return the form best suited for comparison: ASCII when latinizable."""
        if not self.latinize:
            return self.form
        ascii = self.ascii
        if ascii is None:
            return self.form
        return ascii

    @property
    def metaphone(self) -> Optional[str]:
        """Return the metaphone of the ASCII form for latinizable parts longer
        than two characters, otherwise None."""
        if self.latinize:
            ascii_form = self.ascii
            if ascii_form is not None and len(ascii_form) > 2:
                return metaphone(ascii_form)
        return None

    def can_match(self, other: "NamePart") -> bool:
        """Check if this part can match another part. This is based on the tags of the parts."""
        return self.tag.can_match(other.tag)

    def __eq__(self, other: Any) -> bool:
        # Equality relies on the precomputed (index, form) hash.
        try:
            return other._hash == self._hash  # type: ignore
        except AttributeError:
            return False

    def __hash__(self) -> int:
        return self._hash

    def __len__(self) -> int:
        return len(self.form)

    def __repr__(self) -> str:
        return "<NamePart(%r, %s, %r)>" % (self.form, self.index, self.tag.value)

    @classmethod
    def tag_sort(cls, parts: list["NamePart"]) -> list["NamePart"]:
        """Sort name parts by their tag's position in NAME_TAGS_ORDER (not by index)."""
        return sorted(parts, key=lambda np: NAME_TAGS_ORDER.index(np.tag))
can_match(other)
Check if this part can match another part. This is based on the tags of the parts.
Source code in rigour/names/part.py
def can_match(self, other: "NamePart") -> bool:
    """Check if this part can match another part. This is based on the tags
    of the parts (delegates to NamePartTag.can_match)."""
    return self.tag.can_match(other.tag)
tag_sort(parts)
classmethod
Sort name parts by their index.
Source code in rigour/names/part.py
@classmethod
def tag_sort(cls, parts: list["NamePart"]) -> list["NamePart"]:
    """Sort name parts by their tag's position in NAME_TAGS_ORDER (not by index)."""
    return sorted(parts, key=lambda np: NAME_TAGS_ORDER.index(np.tag))
NamePartTag
Bases: Enum
Within a name, identify name part types.
Source code in rigour/names/tag.py
class NamePartTag(Enum):
    """Within a name, identify name part types."""

    ANY = "ANY"
    TITLE = "TITLE"
    GIVEN = "GIVEN"
    MIDDLE = "MIDDLE"
    FAMILY = "FAMILY"
    TRIBAL = "TRIBAL"
    PATRONYMIC = "PATRONYMIC"
    MATRONYMIC = "MATRONYMIC"
    HONORIFIC = "HONORIFIC"
    SUFFIX = "SUFFIX"
    NICK = "NICK"
    STOP = "STOP"  # Stopword
    NUM = "NUM"
    LEGAL = "LEGAL"  # Legal form of an organisation

    def can_match(self, other: "NamePartTag") -> bool:
        """Check if this tag can match the other tag."""
        # ANY is a wildcard, and identical tags always match.
        if NamePartTag.ANY in (self, other) or self == other:
            return True
        # Given-type and family-type tags only match within their own group.
        for group in (GIVEN_NAME_TAGS, FAMILY_NAME_TAGS):
            if self in group and other not in group:
                return False
        return True
can_match(other)
Check if this tag can match the other tag.
Source code in rigour/names/tag.py
def can_match(self, other: "NamePartTag") -> bool:
    """Check if this tag can match the other tag."""
    # ANY is a wildcard, and identical tags always match.
    if NamePartTag.ANY in (self, other) or self == other:
        return True
    # Given-type and family-type tags only match within their own group.
    for group in (GIVEN_NAME_TAGS, FAMILY_NAME_TAGS):
        if self in group and other not in group:
            return False
    return True
NameTypeTag
Bases: Enum
Metadata on what sort of object is described by a name
Source code in rigour/names/tag.py
class NameTypeTag(Enum):
    """Metadata on what sort of object is described by a name."""

    UNK = "UNK"  # Unknown / untyped name
    ENT = "ENT"  # Entity (generic)
    PER = "PER"  # Person
    ORG = "ORG"  # Organization/Company
    OBJ = "OBJ"  # Object - Vessel, Security, etc.
Span
A span is a set of parts of a name that have been tagged with a symbol.
Source code in rigour/names/part.py
class Span:
    """A span is a set of parts of a name that have been tagged with a symbol.

    Spans are value objects: equality and hashing are defined over the tuple
    of parts together with the symbol.
    """

    __slots__ = ["parts", "symbol"]

    def __init__(self, parts: List[NamePart], symbol: Symbol) -> None:
        # Store as a tuple so the span is hashable and not mutated later.
        self.parts = tuple(parts)
        self.symbol = symbol

    @property
    def comparable(self) -> str:
        """Return the comparison-suited string representation of the span."""
        return " ".join([part.comparable for part in self.parts])

    def __len__(self) -> int:
        """Return the combined text length of the parts in the span."""
        return sum(len(part) for part in self.parts)

    def __hash__(self) -> int:
        return hash((self.parts, self.symbol))

    def __eq__(self, other: Any) -> bool:
        # Fix: compare by value instead of `hash(self) == hash(other)`, which
        # produced false positives on hash collisions and raised TypeError
        # when compared against unhashable objects.
        if not isinstance(other, Span):
            return NotImplemented
        return self.parts == other.parts and self.symbol == other.symbol

    def __repr__(self) -> str:
        return f"<Span({self.parts!r}, {self.symbol})>"
comparable
property
Return the comparison-suited string representation of the span.
__len__()
Return the combined text length of the parts in the span.
Source code in rigour/names/part.py
def __len__(self) -> int:
    """Return the combined character length of all parts in the span."""
    total = 0
    for part in self.parts:
        total += len(part)
    return total
Symbol
A symbol is a semantic interpretation applied to one or more parts of a name. Symbols can
represent various categories such as organization classes, initials, names, numeric, or phonetic
transcriptions. Each symbol has a category and an identifier.
Source code in rigour/names/symbol.py
class Symbol:
    """A semantic interpretation attached to one or more parts of a name.

    A symbol pairs a category (organization class, initial, known name,
    number, location, or phonetic transcription) with an identifier.
    """

    class Category(Enum):
        # ORG_TYPE = "ORGTYPE"
        ORG_CLASS = "ORGCLS"
        SYMBOL = "SYMBOL"
        INITIAL = "INITIAL"
        NAME = "NAME"
        NUMERIC = "NUM"
        LOCATION = "LOC"
        PHONETIC = "PHON"

    __slots__ = ["category", "id"]

    def __init__(self, category: Category, id: Any) -> None:
        """Create a symbol with a category and an id."""
        self.category = category
        self.id = id

    def __hash__(self) -> int:
        return hash((self.category, self.id))

    def __eq__(self, other: Any) -> bool:
        # Only symbols with the same category and identifier are equal.
        if not isinstance(other, Symbol):
            return False
        return (self.category, self.id) == (other.category, other.id)

    def __str__(self) -> str:
        return "[%s:%s]" % (self.category.value, self.id)

    def __repr__(self) -> str:
        return "<Symbol(%s, %s)>" % (self.category, self.id)
__init__(category, id)
Create a symbol with a category and an id.
Source code in rigour/names/symbol.py
def __init__(self, category: Category, id: Any) -> None:
    """Create a symbol with a category and an id."""
    self.category = category
    # `id` intentionally shadows the builtin; it is the symbol's identifier.
    self.id = id
align_person_name_order(left, right)
Aligns the name parts of a person name for two names based on their tags and their string
similarity such that the most similar name parts are matched.
Parameters:
Name |
Type |
Description |
Default |
left
|
List[NamePart]
|
The name parts of the first name.
|
required
|
right
|
List[NamePart]
|
The name parts of the second name.
|
required
|
Returns:
Type |
Description |
Tuple[List[NamePart], List[NamePart]]
|
Tuple[List[NamePart], List[NamePart]]: A tuple containing the sorted name parts of both names.
|
Source code in rigour/names/alignment.py
def align_person_name_order(
    left: List[NamePart], right: List[NamePart]
) -> Tuple[List[NamePart], List[NamePart]]:
    """Aligns the name parts of a person name for two names based on their tags and their string
    similarity such that the most similar name parts are matched.

    Args:
        left (List[NamePart]): The name parts of the first name.
        right (List[NamePart]): The name parts of the second name.

    Returns:
        Tuple[List[NamePart], List[NamePart]]: A tuple containing the sorted name parts of both names.
    """
    # With an empty left side there is nothing to align against; fall back
    # to tag-order sorting of the right side.
    if not len(left):
        return (left, NamePart.tag_sort(right))
    left_sorted: List[NamePart] = []
    right_sorted: List[NamePart] = []
    # Consider longer parts first so the greedy pairing favors them.
    left_unused = sorted(left, key=len, reverse=True)
    right_unused = sorted(right, key=len, reverse=True)
    while len(left_unused) > 0 and len(right_unused) > 0:
        # Greedily pick the highest-scoring unused pairing each round.
        best_score = 0.0
        best_left_parts: Optional[List[NamePart]] = None
        best_right_parts: Optional[List[NamePart]] = None
        for qp, rp in product(left_unused, right_unused):
            if not qp.can_match(rp):
                continue
            # An exact comparable match is a perfect pair; stop searching.
            if qp.comparable == rp.comparable:
                best_score = 1.0
                best_left_parts = [qp]
                best_right_parts = [rp]
                break
            # check the Levenshtein distance between the two parts
            score = _name_levenshtein([qp], [rp])
            if score > best_score:
                best_left_parts = [qp]
                best_right_parts = [rp]
                # When one part is longer, try packing several short parts
                # from the other side against it (e.g. hyphenated names).
                if len(qp.form) > len(rp.form):
                    best_right_parts = _pack_short_parts(qp, rp, right_unused)
                elif len(rp.form) > len(qp.form):
                    best_left_parts = _pack_short_parts(rp, qp, left_unused)
                best_score = _name_levenshtein(best_left_parts, best_right_parts)
        if best_score == 0.0:
            # no match found, break out of the loop
            break
        if best_left_parts is not None:
            left_sorted.extend(best_left_parts)
            for qp in best_left_parts:
                left_unused.remove(qp)
        if best_right_parts is not None:
            right_sorted.extend(best_right_parts)
            for rp in best_right_parts:
                right_unused.remove(rp)
    # Nothing aligned at all: fall back to tag-order sorting of both sides.
    if not len(left_sorted):
        return (NamePart.tag_sort(left), NamePart.tag_sort(right))
    # Append leftovers so no part is dropped.
    left_sorted.extend(left_unused)
    right_sorted.extend(right_unused)
    return (left_sorted, right_sorted)
Match any organization type designation (e.g. LLC, Inc, GmbH) in the given entity name and
return the extracted type.
This can be used as a very poor man's method to determine if a given string is a company name.
Parameters:
Name |
Type |
Description |
Default |
name
|
str
|
The text to be processed. It is assumed to be already normalized (see below).
|
required
|
normalizer
|
Callable[[str | None], str | None]
|
A text normalization function to run on the
lookup values before matching to remove text anomalies and make matches more likely.
|
_normalize_compare
|
generic
|
bool
|
If True, return the generic form of the organization type (e.g. LLC, JSC) instead
of the type-specific comparison form (GmbH, AB, NV).
|
False
|
Returns:
Type |
Description |
List[Tuple[str, str]]
|
List[Tuple[str, str]]: List of tuples of the org type as matched, and the compare form of it.
|
Source code in rigour/names/org_types.py
def extract_org_types(
    name: str, normalizer: Normalizer = _normalize_compare, generic: bool = False
) -> List[Tuple[str, str]]:
    """Match any organization type designation (e.g. LLC, Inc, GmbH) in the given entity name and
    return the extracted type.

    This can be used as a very poor man's method to determine if a given string is a company name.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized (see below).
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.
        generic (bool): If True, return the generic form of the organization type (e.g. LLC, JSC) instead
            of the type-specific comparison form (GmbH, AB, NV).

    Returns:
        List[Tuple[str, str]]: List of tuples of the org type as matched, and the compare form of it.
    """
    _func = _generic_replacer if generic else _compare_replacer
    replacer = _func(normalizer=normalizer)
    matches: List[Tuple[str, str]] = []
    for matched in replacer.extract(name):
        # Fall back to the matched text itself when no mapping exists.
        matches.append((matched, replacer.mapping.get(matched, matched)))
    return matches
is_name(name)
Check if the given string is a name. The string is considered a name if it contains at least
one character that is a letter (category 'L' in Unicode).
Source code in rigour/names/check.py
def is_name(name: str) -> bool:
    """Check if the given string is a name: it must contain at least one
    character that is a letter (Unicode major category 'L')."""
    return any(unicodedata.category(char).startswith("L") for char in name)
is_stopword(form, normalizer=normalize_name)
Check if the given form is a stopword. The stopword list is normalized first.
Parameters:
Name |
Type |
Description |
Default |
form
|
str
|
The token to check, must already be normalized.
|
required
|
normalizer
|
Normalizer
|
The normalizer to use for checking stopwords.
|
normalize_name
|
Returns:
Name | Type |
Description |
bool |
bool
|
True if the form is a stopword, False otherwise.
|
Source code in rigour/names/check.py
def is_stopword(form: str, normalizer: Normalizer = normalize_name) -> bool:
    """Check whether the given form is a stopword.

    Args:
        form (str): The token to check, must already be normalized.
        normalizer (Normalizer): Normalizer used to build the stopword list.

    Returns:
        bool: True if the form is a stopword, False otherwise.
    """
    return form in _load_stopwords(normalizer)
load_person_names()
Load the person QID to name mappings from disk. This is a collection
of aliases (in various alphabets) of person name parts mapped to a
Wikidata QID representing that name part.
Returns:
Type |
Description |
Generator[Tuple[str, List[str]], None, None]
|
Generator[Tuple[str, List[str]], None, None]: A generator yielding tuples of QID and list of names.
|
Source code in rigour/names/person.py
def load_person_names() -> Generator[Tuple[str, List[str]], None, None]:
    """Load the person QID to name mappings from disk. This is a collection
    of aliases (in various alphabets) of person name parts mapped to a
    Wikidata QID representing that name part.

    Yields:
        Tuple[str, List[str]]: Pairs of QID and the list of alias names.
    """
    with open(NAMES_DATA_PATH, "r", encoding="utf-8") as fh:
        for raw_line in fh:
            # Each line is formatted as: "alias1, alias2, ... => QID"
            alias_text, qid = raw_line.strip().split(" => ")
            yield qid, alias_text.split(", ")
load_person_names_mapping(normalizer=noop_normalizer, min_mappings=1)
Load the person QID to name mappings from disk. This is a collection
of aliases (in various alphabets) of person name parts mapped to a
Wikidata QID representing that name part.
Parameters:
Name |
Type |
Description |
Default |
normalizer
|
Normalizer
|
A function to normalize names. Defaults to noop_normalizer.
|
noop_normalizer
|
Returns:
Type |
Description |
Dict[str, Set[str]]
|
Dict[str, Set[str]]: A dictionary mapping normalized names to sets of QIDs.
|
Source code in rigour/names/person.py
def load_person_names_mapping(
    normalizer: Normalizer = noop_normalizer, min_mappings: int = 1
) -> Dict[str, Set[str]]:
    """Load the person QID to name mappings from disk and invert them into a
    lookup from normalized name form to QIDs.

    Args:
        normalizer (Normalizer, optional): A function to normalize names.
            Defaults to noop_normalizer.
        min_mappings (int, optional): Minimum number of distinct normalized
            forms a QID must have for its aliases to be included. Defaults to 1.

    Returns:
        Dict[str, Set[str]]: A dictionary mapping normalized names to sets of QIDs.
    """
    mapping: Dict[str, Set[str]] = {}
    for qid, aliases in load_person_names():
        normalized_forms: Set[str] = set()
        for alias in aliases:
            form = normalizer(alias)
            # Skip aliases that normalize to nothing.
            if form:
                normalized_forms.add(form)
        if len(normalized_forms) < min_mappings:
            continue
        for form in normalized_forms:
            mapping.setdefault(form, set()).add(qid)
    return mapping
normalize_name(name, sep=WS)
cached
Normalize a name for tokenization and matching.
Source code in rigour/names/tokenize.py
@lru_cache(maxsize=MEMO_TINY)
def normalize_name(name: Optional[str], sep: str = WS) -> Optional[str]:
    """Normalize a name for tokenization and matching."""
    if name is None:
        return None
    # Pre-normalize, tokenize, then re-join with the requested separator.
    tokens = tokenize_name(prenormalize_name(name))
    joined = sep.join(tokens)
    return joined if joined else None
pick_case(names)
Pick the best mix of lower- and uppercase characters from a set of names
that are identical except for case.
Parameters:
Name |
Type |
Description |
Default |
names
|
List[str]
|
A list of identical names in different cases.
|
required
|
Returns:
Type |
Description |
str
|
Optional[str]: The best name for display.
|
Source code in rigour/names/pick.py
def pick_case(names: List[str]) -> str:
    """Pick the best mix of lower- and uppercase characters from a set of names
    that are identical except for case.

    Args:
        names (List[str]): A list of identical names in different cases.

    Returns:
        str: The name closest to title case among the candidates.

    Raises:
        ValueError: If the list is empty, or the names differ beyond casing.
    """
    if not names:
        raise ValueError("Cannot pick a name from an empty list.")
    if len(names) == 1:
        return names[0]
    # Count, per candidate, how many characters deviate from title case.
    reference = names[0].title()
    diffs: Dict[str, int] = {candidate: 0 for candidate in names}
    for idx, ref_char in enumerate(reference):
        for candidate in names:
            if len(candidate) <= idx:
                raise ValueError("Name length mismatch: %r vs %r" % (candidate, reference))
            char = candidate[idx]
            if char == ref_char:
                continue
            if char.lower() != ref_char.lower():
                raise ValueError("Names mismatch: %r vs %r" % (candidate, reference))
            diffs[candidate] += 1
    # The candidate with the fewest deviations wins (first one on ties).
    return min(diffs.items(), key=lambda item: item[1])[0]
pick_name(names)
Pick the best name from a list of names. This is meant to pick a centroid
name, with a bias towards names in a latin script.
Parameters:
Name |
Type |
Description |
Default |
names
|
List[str]
|
|
required
|
Returns:
Type |
Description |
Optional[str]
|
Optional[str]: The best name for display.
|
Source code in rigour/names/pick.py
def pick_name(names: List[str]) -> Optional[str]:
    """Pick the best name from a list of names. This is meant to pick a centroid
    name, with a bias towards names in a latin script.

    Args:
        names (List[str]): A list of names.

    Returns:
        Optional[str]: The best name for display, or None if nothing qualifies.
    """
    weights: Dict[str, float] = defaultdict(float)
    forms: Dict[str, List[str]] = defaultdict(list)
    latin_names: List[str] = []
    for name in sorted(names):
        form = name.strip().lower()
        if len(form) == 0:
            continue
        # even totally non-Latin names have a base weight of 1:
        latin_shr = latin_share(name)
        if latin_shr > 0.85:
            latin_names.append(name)
        weight = 1 + latin_shr
        weights[form] += weight
        # Record both the raw and title-cased surface forms as candidates.
        forms[form].append(name)
        forms[form].append(name.title())
        norm = ascii_text(form)
        # NOTE(review): assumes ascii_text never returns None here — confirm,
        # otherwise len(norm) would raise TypeError.
        if len(norm) > 2:
            weights[norm] += weight
            forms[norm].append(name)
    # A single latin-script candidate wins outright.
    if len(latin_names) == 1:
        return latin_names[0]
    for form in levenshtein_pick(list(weights.keys()), weights):
        for surface in levenshtein_pick(forms.get(form, []), {}):
            # Only return a surface form that was actually in the input.
            if surface in names:
                return surface
    return None
prenormalize_name(name)
Prepare a name for tokenization and matching.
Source code in rigour/names/tokenize.py
def prenormalize_name(name: Optional[str]) -> str:
    """Prepare a name for tokenization and matching."""
    # Unicode NFC normalization is deliberately not applied at this stage.
    return "" if name is None else name.lower()
reduce_names(names)
Select a reduced set of names from a list of names. This is used to
prepare the set of names linked to a person, organization, or other entity
for publication.
Parameters:
Name |
Type |
Description |
Default |
names
|
List[str]
|
|
required
|
Returns:
Type |
Description |
List[str]
|
List[str]: The reduced list of names.
|
Source code in rigour/names/pick.py
def reduce_names(names: List[str]) -> List[str]:
    """Select a reduced set of names from a list of names. This is used to
    prepare the set of names linked to a person, organization, or other entity
    for publication.

    Args:
        names (List[str]): A list of names.

    Returns:
        List[str]: The reduced list of names.
    """
    if len(names) < 2:
        return [n for n in names if is_name(n)]
    # Group case variants of the same name by their lower-cased form.
    grouped: Dict[str, List[str]] = defaultdict(list)
    for name in names:
        # Filter names that are not valid (e.g. empty or do not contain any letters)
        if not is_name(name):
            log.warning("Invalid name found: %r", name)
            continue
        grouped[name.lower()].append(name)
    result: List[str] = []
    for variants in grouped.values():
        try:
            result.append(pick_case(variants))
        except (ValueError, IndexError, KeyError) as exc:
            log.warning("Failed to pick case: %s", exc)
            # If we cannot pick a case, add all
            result.extend(variants)
    return result
remove_obj_prefixes(name)
Remove prefixes like The, MV, etc.
Source code in rigour/names/prefix.py
def remove_obj_prefixes(name: str) -> str:
    """Remove prefixes like The, MV, etc."""
    # NOTE(review): assumes re_prefixes yields a pattern anchored to the
    # start of the string — confirm in rigour/names/prefix.py.
    return re_prefixes(OBJ_NAME_PREFIXES).sub("", name)
remove_org_prefixes(name)
Remove organisation name prefixes.
Source code in rigour/names/prefix.py
def remove_org_prefixes(name: str) -> str:
    """Remove organisation name prefixes (those listed in ORG_NAME_PREFIXES)."""
    return re_prefixes(ORG_NAME_PREFIXES).sub("", name)
remove_org_types(name, replacement='', normalizer=_normalize_compare)
Match any organization type designation (e.g. LLC, Inc, GmbH) in the given entity name and
replace it with the given fixed string (empty by default, which signals removal).
Parameters:
Name |
Type |
Description |
Default |
name
|
str
|
The text to be processed. It is assumed to be already normalized (see below).
|
required
|
normalizer
|
Callable[[str | None], str | None]
|
A text normalization function to run on the
lookup values before matching to remove text anomalies and make matches more likely.
|
_normalize_compare
|
Returns:
Name | Type |
Description |
str |
str
|
The text with organization types replaced/removed.
|
Source code in rigour/names/org_types.py
def remove_org_types(
    name: str, replacement: str = "", normalizer: Normalizer = _normalize_compare
) -> str:
    """Match any organization type designation (e.g. LLC, Inc, GmbH) in the given
    entity name and replace it with the given fixed string (empty by default,
    which signals removal).

    Args:
        name (str): The text to be processed. It is assumed to be already normalized.
        replacement (str): The string substituted for each matched org type.
        normalizer (Callable[[str | None], str | None]): A text normalization
            function to run on the lookup values before matching.

    Returns:
        str: The text with organization types replaced/removed.
    """
    replacer = _compare_replacer(normalizer=normalizer)
    return replacer.remove(name, replacement=replacement)
remove_person_prefixes(name)
Remove prefixes like Mr., Mrs., etc.
Source code in rigour/names/prefix.py
def remove_person_prefixes(name: str) -> str:
    """Remove prefixes like Mr., Mrs., etc. (those listed in PERSON_NAME_PREFIXES)."""
    return re_prefixes(PERSON_NAME_PREFIXES).sub("", name)
replace_org_types_compare(name, normalizer=_normalize_compare, generic=False)
Replace any organization type indicated in the given entity name (often as a prefix or suffix)
with a heavily normalized form label. This will re-write country-specific entity types (eg. GmbH)
into a simplified spelling suitable for comparison using string distance. The resulting text is
meant to be used in comparison processes, but no longer fit for presentation to a user.
Parameters:
Name |
Type |
Description |
Default |
name
|
str
|
The text to be processed. It is assumed to be already normalized (see below).
|
required
|
normalizer
|
Callable[[str | None], str | None]
|
A text normalization function to run on the
lookup values before matching to remove text anomalies and make matches more likely.
|
_normalize_compare
|
generic
|
bool
|
If True, return the generic form of the organization type (e.g. LLC, JSC) instead
of the type-specific comparison form (GmbH, AB, NV).
|
False
|
Returns:
Type |
Description |
str
|
Optional[str]: The text with organization types replaced.
|
Source code in rigour/names/org_types.py
def replace_org_types_compare(
    name: str, normalizer: Normalizer = _normalize_compare, generic: bool = False
) -> str:
    """Replace any organization type indicated in the given entity name (often as a prefix or suffix)
    with a heavily normalized form label. This will re-write country-specific entity types (eg. GmbH)
    into a simplified spelling suitable for comparison using string distance. The resulting text is
    meant to be used in comparison processes, but no longer fit for presentation to a user.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized (see below).
        normalizer (Callable[[str | None], str | None]): A text normalization function to run on the
            lookup values before matching to remove text anomalies and make matches more likely.
        generic (bool): If True, return the generic form of the organization type (e.g. LLC, JSC) instead
            of the type-specific comparison form (GmbH, AB, NV).

    Returns:
        str: The text with organization types replaced, or the original text
            when the replacer produces nothing.
    """
    _func = _generic_replacer if generic else _compare_replacer
    replacer = _func(normalizer=normalizer)
    # Fall back to the input when the replacer returns a falsy result.
    return replacer(name) or name
replace_org_types_display(name, normalizer=normalize_display)
Replace organization types in the text with their shortened form. This will perform
a display-safe (light) form of normalization, useful for shortening spelt-out legal forms
into common abbreviations (eg. Siemens Aktiengesellschaft -> Siemens AG).
If the result of the replacement yields an empty string, the original text is returned as-is.
Parameters:
Name |
Type |
Description |
Default |
name
|
str
|
The text to be processed. It is assumed to be already normalized (see below).
|
required
|
normalizer
|
Callable[[str | None], str | None]
|
A text normalization function to run on the
lookup values before matching to remove text anomalies and make matches more likely.
|
normalize_display
|
Returns:
Type |
Description |
str
|
Optional[str]: The text with organization types replaced.
|
Source code in rigour/names/org_types.py
def replace_org_types_display(
    name: str, normalizer: Normalizer = normalize_display
) -> str:
    """Replace organization types in the text with their shortened form. This will
    perform a display-safe (light) form of normalization, useful for shortening
    spelt-out legal forms into common abbreviations
    (eg. Siemens Aktiengesellschaft -> Siemens AG).

    If the replacer yields no result, the original text is returned as-is.

    Args:
        name (str): The text to be processed. It is assumed to be already normalized.
        normalizer (Callable[[str | None], str | None]): A text normalization
            function to run on the lookup values before matching.

    Returns:
        str: The text with organization types replaced.
    """
    replacer = _display_replacer(normalizer=normalizer)
    replaced = replacer(name)
    if replaced is None:
        return name
    # Preserve all-caps input by re-uppercasing the replacement.
    return replaced.upper() if name.isupper() else replaced
tag_org_name(name, normalizer)
Tag the name with the organization type and symbol tags.
Source code in rigour/names/tagging.py
def tag_org_name(name: Name, normalizer: Normalizer) -> Name:
    """Tag the name with the organization type and symbol tags."""
    tagger = _get_org_tagger(normalizer)
    # Apply every phrase/symbol pair found in the normalized form as a span.
    for phrase, symbol in tagger(name.norm_form):
        name.apply_phrase(phrase, symbol)
    # presumably infers part-level tags from the applied spans — see
    # _infer_part_tags for the exact behavior.
    return _infer_part_tags(name)
tag_person_name(name, normalizer, any_initials=False)
Tag a person's name with the person name part and other symbol tags.
Source code in rigour/names/tagging.py
def tag_person_name(
    name: Name, normalizer: Normalizer, any_initials: bool = False
) -> Name:
    """Tag a person's name with the person name part and other symbol tags.

    Args:
        name (Name): The person name to annotate in place.
        normalizer (Normalizer): Normalizer used by the person tagger lookup.
        any_initials (bool): When True, treat every single-character part as
            an initial, not just parts tagged as given names.

    Returns:
        Name: The same name object after span application and tag inference.
    """
    # tag given name abbreviations. this is meant to handle a case where the person's
    # first or middle name is an abbreviation, e.g. "J. Smith" or "John Q. Smith"
    for part in name.parts:
        if not part.latinize:
            continue
        # The initial symbol is keyed on the first comparable character.
        sym = Symbol(Symbol.Category.INITIAL, part.comparable[0])
        if any_initials and len(part.form) == 1:
            name.apply_part(part, sym)
        elif part.tag in GIVEN_NAME_TAGS:
            name.apply_part(part, sym)
    # tag the name with person symbols
    tagger = _get_person_tagger(normalizer)
    for phrase, symbol in tagger(name.norm_form):
        name.apply_phrase(phrase, symbol)
    return _infer_part_tags(name)
tokenize_name(text, token_min_length=1)
Split a person or entity's name into name parts.
Source code in rigour/names/tokenize.py
def tokenize_name(text: str, token_min_length: int = 1) -> List[str]:
    """Split a person or entity's name into name parts."""
    # FIXME: Do we want to support CJK scripts at some stage?
    # TODO: Do we want to do some form of unicode normalization here?
    parts: List[str] = []
    current: List[str] = []

    def flush() -> None:
        # Emit the buffered token if it meets the minimum length.
        if len(current) >= token_min_length:
            parts.append("".join(current))
        current.clear()

    for ch in text:
        if ch in SKIP_CHARACTERS:
            continue
        # Map separator categories to whitespace/None, keep other chars.
        mapped = TOKEN_SEP_CATEGORIES.get(unicodedata.category(ch), ch)
        if mapped is None:
            continue
        if mapped == WS:
            flush()
        else:
            current.append(mapped)
    flush()
    return parts