Skip to content

medcat.tokenizing.regex_impl.tokenizer

Classes:

Document

Document(text: str, tokens: Optional[list[MutableToken]] = None)

Methods:

Attributes:

Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
223
224
225
226
227
228
229
def __init__(self, text: str, tokens: Optional[list[MutableToken]] = None
             ) -> None:
    """Create a document over *text*, optionally seeded with *tokens*."""
    self.text = text
    # A falsy tokens argument (None or empty) falls back to a fresh list.
    self._tokens = tokens if tokens else []
    # Character-offset index parallel to _tokens; populated elsewhere.
    self._char_indices: list[int] = []
    # Entities found by NER, and those subsequently linked to concepts.
    self.ner_ents: list[MutableEntity] = []
    self.linked_ents: list[MutableEntity] = []

base property

linked_ents instance-attribute

linked_ents: list[MutableEntity] = []

ner_ents instance-attribute

ner_ents: list[MutableEntity] = []

text instance-attribute

text = text

get_addon_data

get_addon_data(path: str) -> Any
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
281
282
283
284
def get_addon_data(self, path: str) -> Any:
    """Return the addon value stored under *path*.

    Falls back to the class-level default registered via
    register_addon_path; raises if *path* was never registered.
    """
    cls = self.__class__
    if not hasattr(cls, path):
        raise UnregisteredDataPathException(cls, path)
    return getattr(self, path)

get_available_addon_paths

get_available_addon_paths() -> list[str]
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
286
287
288
def get_available_addon_paths(self) -> list[str]:
    """List the registered addon paths that currently hold truthy data."""
    found: list[str] = []
    for candidate in self._addon_extension_paths:
        if self.get_addon_data(candidate):
            found.append(candidate)
    return found

get_tokens

get_tokens(start_index: int, end_index: int) -> list[MutableToken]
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
258
259
260
261
262
263
264
265
def get_tokens(self, start_index: int, end_index: int
               ) -> list[MutableToken]:
    """Return the tokens whose char index lies in [start_index, end_index].

    Uses the precomputed character-offset index for a binary search when
    available; otherwise falls back to a linear scan over the document.
    """
    if not self._char_indices:
        # Index not built yet - scan every token (both bounds inclusive).
        return [tkn for tkn in self
                if start_index <= tkn.base.char_index <= end_index]
    first = bisect_left(self._char_indices, start_index)
    last = bisect_right(self._char_indices, end_index)
    return self._tokens[first:last]

has_addon_data

has_addon_data(path: str) -> bool
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
278
279
def has_addon_data(self, path: str) -> bool:
    """True when the addon value stored under *path* is truthy."""
    value = self.get_addon_data(path)
    return bool(value)

isupper

isupper() -> bool
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
270
271
def isupper(self) -> bool:
    """Whether the document's full text is upper-case (str.isupper semantics)."""
    return self.text.isupper()

register_addon_path classmethod

register_addon_path(path: str, def_val: Any = None, force: bool = True) -> None
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
290
291
292
293
294
@classmethod
def register_addon_path(cls, path: str, def_val: Any = None,
                        force: bool = True) -> None:
    """Register *path* as a class-level addon-data slot with default *def_val*.

    Fix: previously the `force` flag was accepted but ignored, so a second
    registration always clobbered an existing default. With force=False an
    already-present attribute is now left untouched (the path is still
    recorded as registered); force=True (the default) overwrites as before.
    """
    if not force and hasattr(cls, path):
        # Respect force=False: keep the existing value/registration.
        cls._addon_extension_paths.add(path)
        return
    setattr(cls, path, def_val)
    cls._addon_extension_paths.add(path)

set_addon_data

set_addon_data(path: str, val: Any) -> None
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
273
274
275
276
def set_addon_data(self, path: str, val: Any) -> None:
    """Store *val* on this instance under *path*.

    Raises if *path* was never registered on the class.
    """
    cls = self.__class__
    if not hasattr(cls, path):
        raise UnregisteredDataPathException(cls, path)
    setattr(self, path, val)

Entity

Entity(document: Document, text: str, start_index: int, end_index: int, start_char_index: int, end_char_index: int)

Methods:

Attributes:

Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def __init__(self, document: 'Document',
             text: str, start_index: int, end_index: int,
             start_char_index: int, end_char_index: int) -> None:
    """Create an entity covering a token/character span of *document*."""
    # Span bookkeeping (token indices and character offsets).
    self._doc = document
    self._text = text
    self._start_index = start_index
    self._end_index = end_index
    self._start_char_index = start_char_index
    self._end_char_index = end_char_index
    # defaults
    self.cui = ''
    self.detected_name = ''
    self.id = -1  # TODO - what's the default?
    self.link_candidates: list[str] = []
    self.context_similarity: float = 0.0
    self.confidence: float = 0.0

ENTITY_INFO_PREFIX class-attribute instance-attribute

ENTITY_INFO_PREFIX = 'Entity:'

base property

base: BaseEntity

confidence instance-attribute

confidence: float = 0.0

context_similarity instance-attribute

context_similarity: float = 0.0

cui instance-attribute

cui = ''

detected_name instance-attribute

detected_name = ''

end_char_index property

end_char_index: int

end_index property

end_index: int

id instance-attribute

id = -1

label property

label: int

link_candidates instance-attribute

link_candidates: list[str] = []

start_char_index property

start_char_index: int

start_index property

start_index: int

text property

text: str

get_addon_data

get_addon_data(path: str) -> Any
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
180
181
182
183
def get_addon_data(self, path: str) -> Any:
    """Fetch this entity's addon value for *path* from the owning document.

    The document holds one mapping per registered entity path, keyed by
    the entity's (start_index, end_index) span.
    """
    # NOTE: doc.get_addon_data will raise if not registered
    span_key = (self.start_index, self.end_index)
    per_doc = self._doc.get_addon_data(f"{self.ENTITY_INFO_PREFIX}{path}")
    return per_doc[span_key]

get_available_addon_paths

get_available_addon_paths() -> list[str]
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
185
186
187
def get_available_addon_paths(self) -> list[str]:
    """List the registered addon paths with truthy data for this entity."""
    available = []
    for registered in self._addon_extension_paths:
        if self.get_addon_data(registered):
            available.append(registered)
    return available

has_addon_data

has_addon_data(path: str) -> bool
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
177
178
def has_addon_data(self, path: str) -> bool:
    """True when this entity has a truthy addon value stored under *path*."""
    stored = self.get_addon_data(path)
    return bool(stored)

register_addon_path classmethod

register_addon_path(path: str, def_val: Any = None, force: bool = True) -> None
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
189
190
191
192
193
194
195
196
197
198
199
@classmethod
def register_addon_path(cls, path: str, def_val: Any = None,
                        force: bool = True) -> None:
    """Register an entity addon path, backed by storage on the Document.

    Entities may be created and recreated, while the document is constant,
    so the data lives in a per-document defaultdict mapping the entity's
    (start, end) indices to the value; *def_val* is the per-span default.
    """
    def _default() -> Any:
        # Closure over def_val supplies the per-span default value.
        return def_val

    per_doc_store: dict = defaultdict(_default)
    Document.register_addon_path(
        f"{cls.ENTITY_INFO_PREFIX}{path}", def_val=per_doc_store,
        force=force)
    cls._addon_extension_paths.add(path)

set_addon_data

set_addon_data(path: str, val: Any) -> None
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
172
173
174
175
def set_addon_data(self, path: str, val: Any) -> None:
    """Store *val* for this entity's span in the document-level mapping."""
    # NOTE: doc.get_addon_data will raise if not registered
    per_doc = self._doc.get_addon_data(f"{self.ENTITY_INFO_PREFIX}{path}")
    per_doc[(self.start_index, self.end_index)] = val

RegexTokenizer

Bases: BaseTokenizer

Methods:

Attributes:

REGEX class-attribute instance-attribute

REGEX = compile('(([^a-zA-Z0-9\\s]+|\\b\\w+\\b|\\S+)\\s?)')

create_entity

create_entity(doc: MutableDocument, token_start_index: int, token_end_index: int, label: str) -> MutableEntity
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
333
334
335
336
337
338
339
def create_entity(self, doc: MutableDocument,
                  token_start_index: int, token_end_index: int,
                  label: str) -> MutableEntity:
    """Build an entity from the doc's tokens starting at token_start_index.

    NOTE(review): the commented-out line below shows an inclusive-end
    (`end + 1`) variant was considered; the live slice treats
    token_end_index as exclusive - confirm against callers which
    convention is intended. `label` is accepted but unused here.
    """
    rdoc = cast(Document, doc)
    return self.entity_from_tokens(
        # rdoc._tokens[token_start_index: token_end_index + 1])
        rdoc._tokens[token_start_index: token_end_index])

create_new_tokenizer classmethod

create_new_tokenizer(config: Config) -> RegexTokenizer
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
386
387
388
@classmethod
def create_new_tokenizer(cls, config: Config) -> 'RegexTokenizer':
    """Build a fresh tokenizer instance.

    NOTE(review): *config* is accepted for interface parity but not used.
    """
    return cls()

entity_from_tokens

entity_from_tokens(tokens: list[MutableToken]) -> MutableEntity
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
344
345
346
347
348
349
350
def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
    """Create an entity spanning *tokens* (must be non-empty).

    The owning document is recovered from the first token; start/end are
    the positions of the first and last token in the document's token list.
    """
    if not tokens:
        raise ValueError("Need at least one token for an entity")
    first, last = tokens[0], tokens[-1]
    doc = cast(Token, first)._doc
    return _entity_from_tokens(doc, tokens,
                               doc._tokens.index(first),
                               doc._tokens.index(last))

entity_from_tokens_in_doc

entity_from_tokens_in_doc(tokens: list[MutableToken], doc: MutableDocument) -> MutableEntity
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
362
363
364
365
366
367
def entity_from_tokens_in_doc(self, tokens: list[MutableToken],
                              doc: MutableDocument) -> MutableEntity:
    """Reuse an entity already present in *doc* for these tokens, else build one."""
    reused = self._get_existing_entity(tokens, doc)
    return reused if reused else self.entity_from_tokens(tokens)

get_doc_class

get_doc_class() -> Type[MutableDocument]
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
390
391
def get_doc_class(self) -> Type[MutableDocument]:
    """Return the mutable-document implementation used by this tokenizer."""
    return Document

get_entity_class

get_entity_class() -> Type[MutableEntity]
Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
393
394
def get_entity_class(self) -> Type[MutableEntity]:
    """Return the mutable-entity implementation used by this tokenizer."""
    return Entity

Token

Token(document: Document, text: str, _text_with_ws: str, start_index: int, token_index: int, is_punct: bool, to_skip: bool)

Attributes:

Source code in medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
def __init__(self, document: 'Document',
             text: str, _text_with_ws: str,
             start_index: int, token_index: int,
             is_punct: bool, to_skip: bool) -> None:
    """Create a token belonging to *document*.

    start_index is the token's character offset; token_index its position
    in the document's token list; to_skip marks tokens to be ignored by
    downstream processing.
    """
    self._doc = document
    self._text = text
    self._text_with_ws = _text_with_ws
    self._start_index = start_index
    self._token_index = token_index
    self._is_punct = is_punct
    self._to_skip = to_skip
    # defaults
    # NOTE(review): `norm` is a writable property not visible here; this
    # presumably normalises a missing/None value to '' - confirm against
    # the property's implementation.
    if self.norm is None:
        # force spacy to init ''
        self.norm = ''

base property

base: BaseToken

char_index property

char_index: int

index property

index: int

is_digit property

is_digit: bool

is_punctuation property writable

is_punctuation: bool

is_stop property

is_stop: bool

is_upper property

is_upper: bool

lemma property

lemma: str

lower property

lower: str

norm property writable

norm: str

tag property

tag: Optional[str]

text property

text: str

text_versions property

text_versions: list[str]

text_with_ws property

text_with_ws: str

to_skip property writable

to_skip: bool