Skip to content

medcat.pipeline.pipeline

Classes:

Attributes:

logger module-attribute

logger = getLogger(__name__)

DelegatingTokenizer

DelegatingTokenizer(tokenizer: BaseTokenizer, components: list[CoreComponent])

Bases: BaseTokenizer

A delegating tokenizer.

This can be used to create a tokenizer with some preprocessing (i.e. components) included.

Methods:

Attributes:

Source code in medcat-v2/medcat/pipeline/pipeline.py
35
36
37
38
def __init__(self, tokenizer: BaseTokenizer,
             components: list[CoreComponent]):
    """Store the wrapped tokenizer together with its components.

    Args:
        tokenizer (BaseTokenizer): The tokenizer kept on the instance.
        components (list[CoreComponent]): The components kept
            alongside the tokenizer.
    """
    self.tokenizer, self.components = tokenizer, components

components instance-attribute

components = components

tokenizer instance-attribute

tokenizer = tokenizer

create_entity

create_entity(doc: MutableDocument, token_start_index: int, token_end_index: int, label: str) -> MutableEntity
Source code in medcat-v2/medcat/pipeline/pipeline.py
40
41
42
43
44
def create_entity(self, doc: MutableDocument,
                  token_start_index: int, token_end_index: int,
                  label: str) -> MutableEntity:
    """Create an entity by delegating to the wrapped tokenizer.

    All arguments are forwarded unchanged, in the same order.

    Args:
        doc (MutableDocument): The document handed to the tokenizer.
        token_start_index (int): The start token index.
        token_end_index (int): The end token index.
        label (str): The label handed to the tokenizer.

    Returns:
        MutableEntity: Whatever the wrapped tokenizer returns.
    """
    delegate = self.tokenizer
    entity = delegate.create_entity(
        doc, token_start_index, token_end_index, label)
    return entity

create_new_tokenizer classmethod

create_new_tokenizer(config: Config) -> DelegatingTokenizer
Source code in medcat-v2/medcat/pipeline/pipeline.py
59
60
61
@classmethod
def create_new_tokenizer(cls, config: Config) -> 'DelegatingTokenizer':
    """Reject creation through the factory classmethod.

    Args:
        config (Config): Unused.

    Raises:
        ValueError: Always; the initialiser must be used instead.
    """
    message = "Initialise the delegating tokenizer with its initialiser"
    raise ValueError(message)

entity_from_tokens

entity_from_tokens(tokens: list[MutableToken]) -> MutableEntity
Source code in medcat-v2/medcat/pipeline/pipeline.py
46
47
def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
    """Build an entity from tokens via the wrapped tokenizer.

    Args:
        tokens (list[MutableToken]): The tokens to use.

    Returns:
        MutableEntity: Whatever the wrapped tokenizer returns.
    """
    delegate = self.tokenizer
    entity = delegate.entity_from_tokens(tokens)
    return entity

entity_from_tokens_in_doc

entity_from_tokens_in_doc(tokens: list[MutableToken], doc: MutableDocument) -> MutableEntity
Source code in medcat-v2/medcat/pipeline/pipeline.py
49
50
51
def entity_from_tokens_in_doc(
        self, tokens: list[MutableToken], doc: MutableDocument) -> MutableEntity:
    """Build an entity from tokens in a document via the wrapped tokenizer.

    Args:
        tokens (list[MutableToken]): The tokens to use.
        doc (MutableDocument): The document handed to the tokenizer.

    Returns:
        MutableEntity: Whatever the wrapped tokenizer returns.
    """
    delegate = self.tokenizer
    entity = delegate.entity_from_tokens_in_doc(tokens, doc)
    return entity

get_doc_class

get_doc_class() -> type[MutableDocument]
Source code in medcat-v2/medcat/pipeline/pipeline.py
63
64
def get_doc_class(self) -> type[MutableDocument]:
    """Return the document class reported by the wrapped tokenizer.

    Returns:
        type[MutableDocument]: The result of the tokenizer's
            ``get_doc_class``.
    """
    doc_class = self.tokenizer.get_doc_class()
    return doc_class

get_entity_class

get_entity_class() -> type[MutableEntity]
Source code in medcat-v2/medcat/pipeline/pipeline.py
66
67
def get_entity_class(self) -> type[MutableEntity]:
    """Return the entity class reported by the wrapped tokenizer.

    Returns:
        type[MutableEntity]: The result of the tokenizer's
            ``get_entity_class``.
    """
    entity_class = self.tokenizer.get_entity_class()
    return entity_class

IncorrectAddonLoaded

IncorrectAddonLoaded(*args)

Bases: ValueError

Source code in medcat-v2/medcat/pipeline/pipeline.py
471
472
def __init__(self, *args):
    """Initialise the error.

    Args:
        *args: Positional arguments forwarded unchanged to the parent
            class initialiser.
    """
    super().__init__(*args)

IncorrectArgumentsForComponent

IncorrectArgumentsForComponent(comp_type: CoreComponentType, comp_name: str)

Bases: TypeError

Source code in medcat-v2/medcat/pipeline/pipeline.py
442
443
444
445
def __init__(self, comp_type: CoreComponentType, comp_name: str):
    """Initialise the error with a message naming the component.

    Args:
        comp_type (CoreComponentType): The core component type; its
            ``name`` is included in the message.
        comp_name (str): The component name included in the message.
    """
    message = (
        f"Incorrect arguments for core component {comp_type.name} "
        f"({comp_name}).")
    super().__init__(message)

IncorrectArgumentsForTokenizer

IncorrectArgumentsForTokenizer(provider: str)

Bases: TypeError

Source code in medcat-v2/medcat/pipeline/pipeline.py
435
436
437
def __init__(self, provider: str):
    """Initialise the error with a message naming the provider.

    Args:
        provider (str): The tokenizer provider included in the message.
    """
    message = f"Incorrect arguments for tokenizer ({provider})."
    super().__init__(message)

IncorrectCoreComponent

IncorrectCoreComponent(*args)

Bases: ValueError

Source code in medcat-v2/medcat/pipeline/pipeline.py
450
451
def __init__(self, *args):
    """Initialise the error.

    Args:
        *args: Positional arguments forwarded unchanged to the parent
            class initialiser.
    """
    super().__init__(*args)

IncorrectFolderUponLoad

IncorrectFolderUponLoad(*args)

Bases: ValueError

Source code in medcat-v2/medcat/pipeline/pipeline.py
456
457
def __init__(self, *args):
    """Initialise the error.

    Args:
        *args: Positional arguments forwarded unchanged to the parent
            class initialiser.
    """
    super().__init__(*args)

Pipeline

Pipeline(cdb: CDB, vocab: Optional[Vocab], model_load_path: Optional[str], old_pipe: Optional[Pipeline] = None, addon_config_dict: Optional[dict[str, dict]] = None)

The pipeline for the NLP process.

This class is responsible for the initial creation of the NLP document, as well as for running it through all of the components and addons.

Methods:

Attributes:

Source code in medcat-v2/medcat/pipeline/pipeline.py
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def __init__(self, cdb: CDB, vocab: Optional[Vocab],
             model_load_path: Optional[str],
             # NOTE: upon reload, old pipe can be useful
             old_pipe: Optional['Pipeline'] = None,
             addon_config_dict: Optional[dict[str, dict]] = None):
    """Set up the tokenizer, core components and addons.

    Args:
        cdb (CDB): The CDB to use; its ``config`` attribute becomes
            the config of the pipeline.
        vocab (Optional[Vocab]): The vocab. This may be None in the
            case of DeID models.
        model_load_path (Optional[str]): Passed on to both the
            tokenizer and the component initialisation.
            NOTE(review): presumably the path of a saved model, and
            None when there is nothing to load from - confirm.
        old_pipe (Optional[Pipeline]): A previous pipeline, passed
            on to the component initialisation. Defaults to None.
        addon_config_dict (Optional[dict[str, dict]]): Passed on to
            the component initialisation. Defaults to None.
            NOTE(review): presumably per-addon config - confirm.
    """
    self.cdb = cdb
    # NOTE: Vocab is None in case of DeID models and that's fine then,
    #       but it should be non-None otherwise
    self.vocab: Vocab = vocab  # type: ignore
    self.config = self.cdb.config
    # the tokenizer is initialised after the config has been set and
    # before any of the components are initialised
    self._tokenizer = self._init_tokenizer(model_load_path)
    self._components: list[CoreComponent] = []
    self._addons: list[AddonComponent] = []
    self._init_components(model_load_path, old_pipe, addon_config_dict)

cdb instance-attribute

cdb = cdb

config instance-attribute

config = config

tokenizer property

tokenizer: BaseTokenizer

The raw tokenizer (with no components).

tokenizer_with_tag property

tokenizer_with_tag: BaseTokenizer

The tokenizer with the tagging component.

vocab instance-attribute

vocab: Vocab = vocab

add_addon

add_addon(addon: AddonComponent) -> None
Source code in medcat-v2/medcat/pipeline/pipeline.py
394
395
396
397
def add_addon(self, addon: AddonComponent) -> None:
    """Register an addon with the pipeline.

    The addon is appended to the addons of the pipeline, after which
    its config is marked clean.

    Args:
        addon (AddonComponent): The addon to add.
    """
    registered = self._addons
    registered.append(addon)
    # the config counts as clean from the moment of registration
    addon.config.mark_clean()

entity_from_tokens

entity_from_tokens(tokens: list[MutableToken]) -> MutableEntity

Get the entity from the list of tokens.

This effectively turns a list of (consecutive) tokens into an entity.

Parameters:

Returns:

Source code in medcat-v2/medcat/pipeline/pipeline.py
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
    """Get the entity from the list of tokens.

    This effectively turns a list of (consecutive) tokens
    into an entity.

    Deprecated: a ``DeprecationWarning`` is emitted on every call.
    Use ``entity_from_tokens_in_doc`` instead.

    Args:
        tokens (list[MutableToken]): The tokens to use.

    Returns:
        MutableEntity: The resulting entity.
    """
    # NOTE: each fragment ends with a space so that the implicitly
    #       concatenated message reads as proper sentences
    warnings.warn(
        "The `medcat.pipeline.pipeline.Pipeline.entity_from_tokens` method "
        "is deprecated and is subject to removal in a future release. "
        "Please use "
        "`medcat.pipeline.pipeline.Pipeline.entity_from_tokens_in_doc` "
        "instead.",
        DeprecationWarning,
        # attribute the warning to the caller rather than to this method
        stacklevel=2
    )
    return self._tokenizer.entity_from_tokens(tokens)

entity_from_tokens_in_doc

entity_from_tokens_in_doc(tokens: list[MutableToken], doc: MutableDocument) -> MutableEntity

Get the entity from the list of tokens in a document.

This effectively turns a list of (consecutive) tokens into an entity, but it is also designed to reuse existing instances on the document instead of creating new ones.

Parameters:

Returns:

Source code in medcat-v2/medcat/pipeline/pipeline.py
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
def entity_from_tokens_in_doc(self, tokens: list[MutableToken],
                              doc: MutableDocument) -> MutableEntity:
    """Get the entity for a list of tokens within a document.

    This effectively turns a list of (consecutive) tokens into
    an entity, but it is designed to reuse existing instances
    on the document instead of creating new ones.

    Args:
        tokens (list[MutableToken]): The tokens to use.
        doc (MutableDocument): The document for these tokens.

    Returns:
        MutableEntity: The resulting entity.
    """
    tokenizer = self._tokenizer
    entity = tokenizer.entity_from_tokens_in_doc(tokens, doc)
    return entity

get_component

Get the core component by the component type.

Parameters:

Raises:

  • ValueError

    If no component by that type is found.

Returns:

  • CoreComponent ( CoreComponent ) –

    The corresponding core component.

Source code in medcat-v2/medcat/pipeline/pipeline.py
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
def get_component(self, ctype: CoreComponentType) -> CoreComponent:
    """Look up the core component of the given type.

    Args:
        ctype (CoreComponentType): The type of core component wanted.

    Raises:
        ValueError: If none of the components is of that type.

    Returns:
        CoreComponent: The first component of the requested type.
    """
    for candidate in self._components:
        # only genuine core components are considered, and the
        # component type is compared by identity
        if (candidate.is_core() and isinstance(candidate, CoreComponent)
                and candidate.get_type() is ctype):
            return candidate
    raise ValueError(f"No component found of type {ctype}")

get_doc

get_doc(text: str) -> MutableDocument

Get the document for this text.

This essentially runs the tokenizer over the text.

Parameters:

  • text

    (str) –

    The input text.

Returns:

Source code in medcat-v2/medcat/pipeline/pipeline.py
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
def get_doc(self, text: str) -> MutableDocument:
    """Build the document for the given text.

    The text is run through the tokenizer first. The result is then
    passed through every core component and, after that, through
    every addon, in the order they are stored on the pipeline.

    Args:
        text (str): The input text.

    Returns:
        MutableDocument: The resulting document.
    """
    document = self._tokenizer(text)
    for component in self._components:
        logger.info(
            "Running component %s for %d of text (%s)",
            component.full_name, len(text), id(text))
        document = component(document)
    # addons run only once all of the core components are done
    for extra in self._addons:
        document = extra(document)
    return document

iter_addons

iter_addons() -> Iterable[AddonComponent]
Source code in medcat-v2/medcat/pipeline/pipeline.py
429
430
def iter_addons(self) -> Iterable[AddonComponent]:
    """Iterate over the addons of the pipeline.

    Yields:
        AddonComponent: Each addon, in the order it was added.
    """
    for addon in self._addons:
        yield addon

iter_all_components

iter_all_components() -> Iterable[BaseComponent]
Source code in medcat-v2/medcat/pipeline/pipeline.py
423
424
425
426
427
def iter_all_components(self) -> Iterable[BaseComponent]:
    """Iterate over every component of the pipeline.

    Yields:
        BaseComponent: The core components first, followed by
            the addons.
    """
    yield from self._components
    yield from self._addons

save_components

save_components(serialiser_type: Union[AvailableSerialisers, str], components_folder: str) -> None
Source code in medcat-v2/medcat/pipeline/pipeline.py
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
def save_components(self,
                    serialiser_type: Union[AvailableSerialisers, str],
                    components_folder: str) -> None:
    """Serialise every serialisable component into its own sub-folder.

    Components that are not ``Serialisable`` are skipped. The
    components folder is only created once the first serialisable
    component is encountered.

    Args:
        serialiser_type (Union[AvailableSerialisers, str]): The
            serialiser to use; passed on to ``serialise``.
        components_folder (str): The folder in which the per-component
            folders are placed. It is created, along with any missing
            parent folders, if it does not exist yet.

    Raises:
        ValueError: If a serialisable component is neither a
            CoreComponent nor an AddonComponent.
    """
    for component in self.iter_all_components():
        if not isinstance(component, Serialisable):
            continue
        # NOTE: `exist_ok` avoids the race between checking for the
        #       folder and creating it; missing parents are created too
        os.makedirs(components_folder, exist_ok=True)
        if isinstance(component, CoreComponent):
            # core components are named after their component type
            comp_folder = os.path.join(
                components_folder,
                AbstractCoreComponent.NAME_PREFIX +
                component.get_type().name)
        elif isinstance(component, AddonComponent):
            # addons are named after both the addon type and their name
            comp_folder = os.path.join(
                components_folder,
                f"{AddonComponent.NAME_PREFIX}{component.addon_type}"
                f"{AddonComponent.NAME_SPLITTER}{component.name}")
        else:
            raise ValueError(
                f"Unknown component: {type(component)} - does not appear "
                "to be a CoreComponent or an AddonComponent")
        serialise(serialiser_type, component, comp_folder)

UnkownAddonConfig

UnkownAddonConfig(cnf: ComponentConfig, *existing_types: type[ComponentConfig])

Bases: ValueError

Source code in medcat-v2/medcat/pipeline/pipeline.py
462
463
464
465
466
def __init__(self, cnf: ComponentConfig,
             *existing_types: type[ComponentConfig]):
    """Initialise the error with a message describing the config.

    Args:
        cnf (ComponentConfig): The config whose type is reported
            as unknown.
        *existing_types (type[ComponentConfig]): The config types
            whose names are listed in the message.
    """
    known_names = [etype.__name__ for etype in existing_types]
    message = (
        f"Found unknown Addon config of type {type(cnf)}. "
        f"Existing types: {known_names}")
    super().__init__(message)