Skip to content

medcat.tokenizing.regex_impl.tokenizer

Classes:

Document

Document(text: str, tokens: Optional[list[MutableToken]] = None)

Methods:

Attributes:

Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
223
224
225
226
227
228
229
def __init__(self, text: str, tokens: Optional[list[MutableToken]] = None
             ) -> None:
    """Create a document over *text*, optionally seeded with *tokens*."""
    self.text = text
    # A falsy tokens argument (None or empty) falls back to a fresh list.
    self._tokens = tokens if tokens else []
    # Character-offset index parallel to _tokens; populated elsewhere.
    self._char_indices: list[int] = []
    # Entities found by NER, and those subsequently linked to concepts.
    self.ner_ents: list[MutableEntity] = []
    self.linked_ents: list[MutableEntity] = []

base property

linked_ents instance-attribute

linked_ents: list[MutableEntity] = []

ner_ents instance-attribute

ner_ents: list[MutableEntity] = []

text instance-attribute

text = text

get_addon_data

get_addon_data(path: str) -> Any
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
281
282
283
284
def get_addon_data(self, path: str) -> Any:
    """Return the addon value stored under *path*.

    Falls back to the class-level default registered via
    register_addon_path; raises if *path* was never registered.
    """
    cls = self.__class__
    if not hasattr(cls, path):
        raise UnregisteredDataPathException(cls, path)
    return getattr(self, path)

get_available_addon_paths

get_available_addon_paths() -> list[str]
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
286
287
288
def get_available_addon_paths(self) -> list[str]:
    """List the registered addon paths that currently hold truthy data."""
    found: list[str] = []
    for candidate in self._addon_extension_paths:
        if self.get_addon_data(candidate):
            found.append(candidate)
    return found

get_tokens

get_tokens(start_index: int, end_index: int) -> list[MutableToken]
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
258
259
260
261
262
263
264
265
def get_tokens(self, start_index: int, end_index: int
               ) -> list[MutableToken]:
    """Return the tokens whose char index lies in [start_index, end_index].

    Uses the precomputed character-offset index for a binary search when
    available; otherwise falls back to a linear scan over the document.
    """
    if not self._char_indices:
        # Index not built yet - scan every token (both bounds inclusive).
        return [tkn for tkn in self
                if start_index <= tkn.base.char_index <= end_index]
    first = bisect_left(self._char_indices, start_index)
    last = bisect_right(self._char_indices, end_index)
    return self._tokens[first:last]

has_addon_data

has_addon_data(path: str) -> bool
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
278
279
def has_addon_data(self, path: str) -> bool:
    """True when the addon value stored under *path* is truthy."""
    value = self.get_addon_data(path)
    return bool(value)

isupper

isupper() -> bool
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
270
271
def isupper(self) -> bool:
    """Whether the document's full text is upper-case (str.isupper semantics)."""
    return self.text.isupper()

register_addon_path classmethod

register_addon_path(path: str, def_val: Any = None, force: bool = True) -> None
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
290
291
292
293
294
@classmethod
def register_addon_path(cls, path: str, def_val: Any = None,
                        force: bool = True) -> None:
    """Register *path* as a class-level addon-data slot with default *def_val*.

    Fix: previously the `force` flag was accepted but ignored, so a second
    registration always clobbered an existing default. With force=False an
    already-present attribute is now left untouched (the path is still
    recorded as registered); force=True (the default) overwrites as before.
    """
    if not force and hasattr(cls, path):
        # Respect force=False: keep the existing value/registration.
        cls._addon_extension_paths.add(path)
        return
    setattr(cls, path, def_val)
    cls._addon_extension_paths.add(path)

set_addon_data

set_addon_data(path: str, val: Any) -> None
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
273
274
275
276
def set_addon_data(self, path: str, val: Any) -> None:
    """Store *val* on this instance under *path*.

    Raises if *path* was never registered on the class.
    """
    cls = self.__class__
    if not hasattr(cls, path):
        raise UnregisteredDataPathException(cls, path)
    setattr(self, path, val)

Entity

Entity(document: Document, text: str, start_index: int, end_index: int, start_char_index: int, end_char_index: int)

Methods:

Attributes:

Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def __init__(self, document: 'Document',
             text: str, start_index: int, end_index: int,
             start_char_index: int, end_char_index: int) -> None:
    """Create an entity covering a token/character span of *document*."""
    # Span bookkeeping (token indices and character offsets).
    self._doc = document
    self._text = text
    self._start_index = start_index
    self._end_index = end_index
    self._start_char_index = start_char_index
    self._end_char_index = end_char_index
    # defaults
    self.cui = ''
    self.detected_name = ''
    self.id = -1  # TODO - what's the default?
    self.link_candidates: list[str] = []
    self.context_similarity: float = 0.0
    self.confidence: float = 0.0

ENTITY_INFO_PREFIX class-attribute instance-attribute

ENTITY_INFO_PREFIX = 'Entity:'

base property

base: BaseEntity

confidence instance-attribute

confidence: float = 0.0

context_similarity instance-attribute

context_similarity: float = 0.0

cui instance-attribute

cui = ''

detected_name instance-attribute

detected_name = ''

end_char_index property

end_char_index: int

end_index property

end_index: int

id instance-attribute

id = -1

label property

label: int

link_candidates instance-attribute

link_candidates: list[str] = []

start_char_index property

start_char_index: int

start_index property

start_index: int

text property

text: str

get_addon_data

get_addon_data(path: str) -> Any
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
180
181
182
183
def get_addon_data(self, path: str) -> Any:
    """Fetch this entity's addon value for *path* from the owning document.

    The document holds one mapping per registered entity path, keyed by
    the entity's (start_index, end_index) span.
    """
    # NOTE: doc.get_addon_data will raise if not registered
    span_key = (self.start_index, self.end_index)
    per_doc = self._doc.get_addon_data(f"{self.ENTITY_INFO_PREFIX}{path}")
    return per_doc[span_key]

get_available_addon_paths

get_available_addon_paths() -> list[str]
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
185
186
187
def get_available_addon_paths(self) -> list[str]:
    """List the registered addon paths with truthy data for this entity."""
    available = []
    for registered in self._addon_extension_paths:
        if self.get_addon_data(registered):
            available.append(registered)
    return available

has_addon_data

has_addon_data(path: str) -> bool
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
177
178
def has_addon_data(self, path: str) -> bool:
    """True when this entity has a truthy addon value stored under *path*."""
    stored = self.get_addon_data(path)
    return bool(stored)

register_addon_path classmethod

register_addon_path(path: str, def_val: Any = None, force: bool = True) -> None
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
189
190
191
192
193
194
195
196
197
198
199
@classmethod
def register_addon_path(cls, path: str, def_val: Any = None,
                        force: bool = True) -> None:
    """Register an entity addon path, backed by storage on the Document.

    Entities may be created and recreated, while the document is constant,
    so the data lives in a per-document defaultdict mapping the entity's
    (start, end) indices to the value; *def_val* is the per-span default.
    """
    def _default() -> Any:
        # Closure over def_val supplies the per-span default value.
        return def_val

    per_doc_store: dict = defaultdict(_default)
    Document.register_addon_path(
        f"{cls.ENTITY_INFO_PREFIX}{path}", def_val=per_doc_store,
        force=force)
    cls._addon_extension_paths.add(path)

set_addon_data

set_addon_data(path: str, val: Any) -> None
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
172
173
174
175
def set_addon_data(self, path: str, val: Any) -> None:
    """Store *val* for this entity's span in the document-level mapping."""
    # NOTE: doc.get_addon_data will raise if not registered
    per_doc = self._doc.get_addon_data(f"{self.ENTITY_INFO_PREFIX}{path}")
    per_doc[(self.start_index, self.end_index)] = val

RegexTokenizer

Bases: BaseTokenizer

Methods:

Attributes:

REGEX class-attribute instance-attribute

REGEX = compile('(([^a-zA-Z0-9\\s]+|\\b\\w+\\b|\\S+)\\s?)')

create_entity

create_entity(doc: MutableDocument, token_start_index: int, token_end_index: int, label: str) -> MutableEntity
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
333
334
335
336
337
338
339
def create_entity(self, doc: MutableDocument,
                  token_start_index: int, token_end_index: int,
                  label: str) -> MutableEntity:
    """Build an entity from the doc's tokens starting at token_start_index.

    NOTE(review): the commented-out line below shows an inclusive-end
    (`end + 1`) variant was considered; the live slice treats
    token_end_index as exclusive - confirm against callers which
    convention is intended. `label` is accepted but unused here.
    """
    rdoc = cast(Document, doc)
    return self.entity_from_tokens(
        # rdoc._tokens[token_start_index: token_end_index + 1])
        rdoc._tokens[token_start_index: token_end_index])

create_new_tokenizer classmethod

create_new_tokenizer(config: Config) -> RegexTokenizer
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
386
387
388
@classmethod
def create_new_tokenizer(cls, config: Config) -> 'RegexTokenizer':
    """Build a fresh tokenizer instance.

    NOTE(review): *config* is accepted for interface parity but not used.
    """
    return cls()

entity_from_tokens

entity_from_tokens(tokens: list[MutableToken]) -> MutableEntity
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
344
345
346
347
348
349
350
def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
    """Create an entity spanning *tokens* (must be non-empty).

    The owning document is recovered from the first token; start/end are
    the positions of the first and last token in the document's token list.
    """
    if not tokens:
        raise ValueError("Need at least one token for an entity")
    first, last = tokens[0], tokens[-1]
    doc = cast(Token, first)._doc
    return _entity_from_tokens(doc, tokens,
                               doc._tokens.index(first),
                               doc._tokens.index(last))

entity_from_tokens_in_doc

entity_from_tokens_in_doc(tokens: list[MutableToken], doc: MutableDocument) -> MutableEntity
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
362
363
364
365
366
367
def entity_from_tokens_in_doc(self, tokens: list[MutableToken],
                              doc: MutableDocument) -> MutableEntity:
    """Reuse an entity already present in *doc* for these tokens, else build one."""
    reused = self._get_existing_entity(tokens, doc)
    return reused if reused else self.entity_from_tokens(tokens)

get_doc_class

get_doc_class() -> Type[MutableDocument]
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
390
391
def get_doc_class(self) -> Type[MutableDocument]:
    """Return the mutable-document implementation used by this tokenizer."""
    return Document

get_entity_class

get_entity_class() -> Type[MutableEntity]
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
393
394
def get_entity_class(self) -> Type[MutableEntity]:
    """Return the mutable-entity implementation used by this tokenizer."""
    return Entity

Token

Token(document: Document, text: str, _text_with_ws: str, start_index: int, token_index: int, is_punct: bool, to_skip: bool)

Attributes:

Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
def __init__(self, document: 'Document',
             text: str, _text_with_ws: str,
             start_index: int, token_index: int,
             is_punct: bool, to_skip: bool) -> None:
    """Create a token belonging to *document*.

    start_index is the token's character offset; token_index its position
    in the document's token list; to_skip marks tokens to be ignored by
    downstream processing.
    """
    self._doc = document
    self._text = text
    self._text_with_ws = _text_with_ws
    self._start_index = start_index
    self._token_index = token_index
    self._is_punct = is_punct
    self._to_skip = to_skip
    # defaults
    # NOTE(review): `norm` is a writable property not visible here; this
    # presumably normalises a missing/None value to '' - confirm against
    # the property's implementation.
    if self.norm is None:
        # force spacy to init ''
        self.norm = ''

base property

base: BaseToken

char_index property

char_index: int

index property

index: int

is_digit property

is_digit: bool

is_punctuation property writable

is_punctuation: bool

is_stop property

is_stop: bool

is_upper property

is_upper: bool

lemma property

lemma: str

lower property

lower: str

norm property writable

norm: str

tag property

tag: Optional[str]

text property

text: str

text_versions property

text_versions: list[str]

text_with_ws property

text_with_ws: str

to_skip property writable

to_skip: bool