How to create and use a (core) component with medcat v2¶

The overall process is quite simple:

Implement and extend CoreComponent
Specify the CoreComponentType
Register the component

Creating component¶

In [ ]:

Copied!





# for init args
from typing import Optional, Any

from medcat.tokenizing.tokenizers import BaseTokenizer
from medcat.vocab import Vocab
from medcat.cdb.cdb import CDB
from medcat.config.config import Ner
# for the component itself
from medcat.components.types import CoreComponentType
from medcat.components.types import AbstractEntityProvidingComponent
from medcat.tokenizing.tokens import MutableDocument, MutableEntity
from medcat.components.ner.vocab_based_annotator import maybe_annotate_name

# for the randomness
import random


class RNG:

    def __init__(self, min: int, mean: int, max: int, std: float):
        self.min = min
        self.mean = mean
        self.max = max
        self.std = std

    def get(self):
        num = int(random.normalvariate(self.mean, self.std))
        return min(max(self.min, num), self.max)


class RandomNER(AbstractEntityProvidingComponent):
    # NOTE: NEED TO IMPLEMENT
    name = "RANDOM_NER"

    # NOTE: NEED TO IMPLEMENT 
    @classmethod
    def create_new_component(
            cls, cnf: Ner, tokenizer: BaseTokenizer, cdb: CDB, vocab: Vocab,
            model_load_path: Optional[str]) -> 'RandomNER':
        return cls(tokenizer, cdb)

    # NOTE: NEED TO IMPLEMENT
    # you can specify whatever init args as long as you define them above
    def __init__(self, tokenizer: BaseTokenizer, cdb: CDB):
        super().__init__()
        self.tokenizer = tokenizer
        self.cdb = cdb

        # this is just for the randomness to kind of make sense
        # i.e create an entity for every 10 tokens
        self.tkns_per_entity = 10
        # random number generator for length of the entity (in tokens)
        self.rng_len = RNG(
            min=1, mean=4, max=8, std=2
        )

    # NOTE: NEED TO IMPLEMENT
    # the type of core component
    def get_type(self) -> CoreComponentType:
        return CoreComponentType.ner

    # NOTE: NEED TO IMPLEMENT
    def predict_entities(self, doc: MutableDocument,
                         ents: list[MutableEntity] | None = None
                         ) -> list[MutableEntity]:
        """Detect candidates for concepts - linker will then be able
        to do the rest. It adds `entities` to the doc.entities and each
        entity can have the entity.link_candidates - that the linker
        will resolve.

        Args:
            doc (MutableDocument):
                Spacy document to be annotated with named entities.
            ents list[MutableEntity] | None = None:
                The entties to use. None expected here.

        Returns:
            doc (MutableDocument):
                Spacy document with detected entities.
        """
        num_tokens = len(list(doc))
        num_ents = num_tokens // self.tkns_per_entity

        start_tkn_indices = sorted([random.randint(0, num_tokens - 3)
                                    for _ in range(num_ents)])
        end_tkn_indices = [min(start + self.rng_len.get(), num_tokens - 2)
                           for start in start_tkn_indices]
        choose_from = list(self.cdb.name2info.keys())
        chosen_name = [random.choice(choose_from) for _ in start_tkn_indices]
        ner_ents: list[MutableEntity] = []
        for tkn_start_idx, tkn_end_idx, linked_name in zip(start_tkn_indices, end_tkn_indices, chosen_name):
            char_start_idx = doc[tkn_start_idx].base.char_index
            # NOTE: can only do this since we're never selecting the last token
            char_end_idx = doc[tkn_end_idx + 1].base.char_index
            cur_tokens = doc.get_tokens(char_start_idx, char_end_idx)
            # NOTE: the get_tokens method will only return a MutableEntity if it's been set,
            #       but nothing should be set before the NER component runs, so it should be
            #       safe to assume that these are all lists of tokens

            # this checks the config (i.e length and stuff) and then annotes
            ent = maybe_annotate_name(self.tokenizer, linked_name, cur_tokens, doc, self.cdb, self.cdb.config, len(ner_ents))
            if ent:
                ner_ents.append(ent)
        return ner_ents
# for init args
from typing import Optional, Any

from medcat.tokenizing.tokenizers import BaseTokenizer
from medcat.vocab import Vocab
from medcat.cdb.cdb import CDB
from medcat.config.config import Ner
# for the component itself
from medcat.components.types import CoreComponentType
from medcat.components.types import AbstractEntityProvidingComponent
from medcat.tokenizing.tokens import MutableDocument, MutableEntity
from medcat.components.ner.vocab_based_annotator import maybe_annotate_name

# for the randomness
import random


class RNG:

    def __init__(self, min: int, mean: int, max: int, std: float):
        self.min = min
        self.mean = mean
        self.max = max
        self.std = std

    def get(self):
        num = int(random.normalvariate(self.mean, self.std))
        return min(max(self.min, num), self.max)


class RandomNER(AbstractEntityProvidingComponent):
    # NOTE: NEED TO IMPLEMENT
    name = "RANDOM_NER"

    # NOTE: NEED TO IMPLEMENT 
    @classmethod
    def create_new_component(
            cls, cnf: Ner, tokenizer: BaseTokenizer, cdb: CDB, vocab: Vocab,
            model_load_path: Optional[str]) -> 'RandomNER':
        return cls(tokenizer, cdb)

    # NOTE: NEED TO IMPLEMENT
    # you can specify whatever init args as long as you define them above
    def __init__(self, tokenizer: BaseTokenizer, cdb: CDB):
        super().__init__()
        self.tokenizer = tokenizer
        self.cdb = cdb

        # this is just for the randomness to kind of make sense
        # i.e create an entity for every 10 tokens
        self.tkns_per_entity = 10
        # random number generator for length of the entity (in tokens)
        self.rng_len = RNG(
            min=1, mean=4, max=8, std=2
        )

    # NOTE: NEED TO IMPLEMENT
    # the type of core component
    def get_type(self) -> CoreComponentType:
        return CoreComponentType.ner

    # NOTE: NEED TO IMPLEMENT
    def predict_entities(self, doc: MutableDocument,
                         ents: list[MutableEntity] | None = None
                         ) -> list[MutableEntity]:
        """Detect candidates for concepts - linker will then be able
        to do the rest. It adds `entities` to the doc.entities and each
        entity can have the entity.link_candidates - that the linker
        will resolve.

        Args:
            doc (MutableDocument):
                Spacy document to be annotated with named entities.
            ents list[MutableEntity] | None = None:
                The entties to use. None expected here.

        Returns:
            doc (MutableDocument):
                Spacy document with detected entities.
        """
        num_tokens = len(list(doc))
        num_ents = num_tokens // self.tkns_per_entity

        start_tkn_indices = sorted([random.randint(0, num_tokens - 3)
                                    for _ in range(num_ents)])
        end_tkn_indices = [min(start + self.rng_len.get(), num_tokens - 2)
                           for start in start_tkn_indices]
        choose_from = list(self.cdb.name2info.keys())
        chosen_name = [random.choice(choose_from) for _ in start_tkn_indices]
        ner_ents: list[MutableEntity] = []
        for tkn_start_idx, tkn_end_idx, linked_name in zip(start_tkn_indices, end_tkn_indices, chosen_name):
            char_start_idx = doc[tkn_start_idx].base.char_index
            # NOTE: can only do this since we're never selecting the last token
            char_end_idx = doc[tkn_end_idx + 1].base.char_index
            cur_tokens = doc.get_tokens(char_start_idx, char_end_idx)
            # NOTE: the get_tokens method will only return a MutableEntity if it's been set,
            #       but nothing should be set before the NER component runs, so it should be
            #       safe to assume that these are all lists of tokens

            # this checks the config (i.e length and stuff) and then annotes
            ent = maybe_annotate_name(self.tokenizer, linked_name, cur_tokens, doc, self.cdb, self.cdb.config, len(ner_ents))
            if ent:
                ner_ents.append(ent)
        return ner_ents

Registering the component¶

In [ ]:

Copied!





from medcat.components.types import register_core_component
# NOTE: in MedCAT v2.5 and onwards, you can import and use `lazy_register_core_component` 
#       to do lazy registartion instead
register_core_component(CoreComponentType.ner, RandomNER.name, RandomNER.create_new_component)
from medcat.components.types import register_core_component
# NOTE: in MedCAT v2.5 and onwards, you can import and use `lazy_register_core_component` 
#       to do lazy registartion instead
register_core_component(CoreComponentType.ner, RandomNER.name, RandomNER.create_new_component)

Using custom component¶

In [ ]:

Copied!





from medcat.config.config import Config
from medcat.preprocessors.cleaners import NameDescriptor
from medcat.cat import CAT
import numpy as np

from pprint import pprint

# start with a config
cnf = Config()
# NOTE: the default is to use regex tokenizer
# set the new/registered component
cnf.components.ner.comp_name = RandomNER.name

# creating an empty Vocab - we don't really need it for this demonstration
vocab = Vocab()
print("Vocab (should be empty):", vocab.vocab)

# create a (random!) CDB
rndom_concepts = {
    "C01": (
        {"CONCEPT1": NameDescriptor(
            tokens=["CONCEPT", "1"],
            snames=["concept", "1"],
            raw_name="CONCEPT1",
            is_upper=True)
        }, {"ONT1",}, "Concept 1 description"
    ),
    "C02": (
        {"CONCEPT2": NameDescriptor(
            tokens=["CONCEPT", "2"],
            snames=["concept", "2"],
            raw_name="CONCEPT2",
            is_upper=True)
        }, {"ONT1",}, "Concept 2 description"
    )
}
cdb = CDB(cnf)
for cui, (names, ontologies, descr) in rndom_concepts.items():
    cdb._add_concept(cui=cui, names=names, ontologies=ontologies,
                     name_status='P', type_ids=['T1'],
                     description=descr, full_build=True)
print("CDB cui2info", cdb.cui2info)
print("CDB name2info", cdb.cui2info)

# creeate CAT
cat = CAT(cdb, vocab)
print("Got CAT")
print("Verifying the type of component we're using")
print("NER:", cat._pipeline.get_component(CoreComponentType.ner))

text = """Some friends are concept1, but most foes is concept2.
We can have a much longer conversation about concepts1 and concepts2 which is
not going to get us anywhere we are not already.
This is just filler stuff for the concepts so that we can "detect" multiple ones.
"""

ents = cat.get_entities(text)
print("ENTITIES")
for ent in ents["entities"].values():
    pprint(ent)
from medcat.config.config import Config
from medcat.preprocessors.cleaners import NameDescriptor
from medcat.cat import CAT
import numpy as np

from pprint import pprint

# start with a config
cnf = Config()
# NOTE: the default is to use regex tokenizer
# set the new/registered component
cnf.components.ner.comp_name = RandomNER.name

# creating an empty Vocab - we don't really need it for this demonstration
vocab = Vocab()
print("Vocab (should be empty):", vocab.vocab)

# create a (random!) CDB
rndom_concepts = {
    "C01": (
        {"CONCEPT1": NameDescriptor(
            tokens=["CONCEPT", "1"],
            snames=["concept", "1"],
            raw_name="CONCEPT1",
            is_upper=True)
        }, {"ONT1",}, "Concept 1 description"
    ),
    "C02": (
        {"CONCEPT2": NameDescriptor(
            tokens=["CONCEPT", "2"],
            snames=["concept", "2"],
            raw_name="CONCEPT2",
            is_upper=True)
        }, {"ONT1",}, "Concept 2 description"
    )
}
cdb = CDB(cnf)
for cui, (names, ontologies, descr) in rndom_concepts.items():
    cdb._add_concept(cui=cui, names=names, ontologies=ontologies,
                     name_status='P', type_ids=['T1'],
                     description=descr, full_build=True)
print("CDB cui2info", cdb.cui2info)
print("CDB name2info", cdb.cui2info)

# creeate CAT
cat = CAT(cdb, vocab)
print("Got CAT")
print("Verifying the type of component we're using")
print("NER:", cat._pipeline.get_component(CoreComponentType.ner))

text = """Some friends are concept1, but most foes is concept2.
We can have a much longer conversation about concepts1 and concepts2 which is
not going to get us anywhere we are not already.
This is just filler stuff for the concepts so that we can "detect" multiple ones.
"""

ents = cat.get_entities(text)
print("ENTITIES")
for ent in ents["entities"].values():
    pprint(ent)

Training was enabled during inference. It was automatically disabled.

Vocab (should be empty): {}
CDB cui2info {'C01': CUIInfo(cui='C01', preferred_name='CONCEPT1', names={'CONCEPT1'}, subnames={'1', 'concept'}, type_ids=['T1'], description='Concept 1 description', original_names={'CONCEPT1'}, tags=[], group=None, in_other_ontology={'ontologies': {'ONT1'}}, count_train=0, context_vectors=None, average_confidence=0.0), 'C02': CUIInfo(cui='C02', preferred_name='CONCEPT2', names={'CONCEPT2'}, subnames={'concept', '2'}, type_ids=['T1'], description='Concept 2 description', original_names={'CONCEPT2'}, tags=[], group=None, in_other_ontology={'ontologies': {'ONT1'}}, count_train=0, context_vectors=None, average_confidence=0.0)}
CDB name2info {'C01': CUIInfo(cui='C01', preferred_name='CONCEPT1', names={'CONCEPT1'}, subnames={'1', 'concept'}, type_ids=['T1'], description='Concept 1 description', original_names={'CONCEPT1'}, tags=[], group=None, in_other_ontology={'ontologies': {'ONT1'}}, count_train=0, context_vectors=None, average_confidence=0.0), 'C02': CUIInfo(cui='C02', preferred_name='CONCEPT2', names={'CONCEPT2'}, subnames={'concept', '2'}, type_ids=['T1'], description='Concept 2 description', original_names={'CONCEPT2'}, tags=[], group=None, in_other_ontology={'ontologies': {'ONT1'}}, count_train=0, context_vectors=None, average_confidence=0.0)}
Got CAT
Verifying the type of component we're using
NER: <__main__.RandomNER object at 0x10a47f670>
ENTITIES
{'acc': 1,
 'context_center': [],
 'context_left': [],
 'context_right': [],
 'context_similarity': 1,
 'cui': 'C01',
 'detected_name': 'CONCEPT1',
 'end': 92,
 'id': 0,
 'meta_anns': {},
 'pretty_name': 'CONCEPT1',
 'source_value': 'concept2.\nWe can have a much longer conversation',
 'start': 44,
 'type_ids': ['T1']}
{'acc': 1,
 'context_center': [],
 'context_left': [],
 'context_right': [],
 'context_similarity': 1,
 'cui': 'C02',
 'detected_name': 'CONCEPT2',
 'end': 231,
 'id': 2,
 'meta_anns': {},
 'pretty_name': 'CONCEPT2',
 'source_value': 'just filler stuff for the concepts so that',
 'start': 189,
 'type_ids': ['T1']}
{'acc': 1,
 'context_center': [],
 'context_left': [],
 'context_right': [],
 'context_similarity': 1,
 'cui': 'C01',
 'detected_name': 'CONCEPT1',
 'end': 151,
 'id': 1,
 'meta_anns': {},
 'pretty_name': 'CONCEPT1',
 'source_value': 'going to get us',
 'start': 136,
 'type_ids': ['T1']}
{'acc': 1,
 'context_center': [],
 'context_left': [],
 'context_right': [],
 'context_similarity': 1,
 'cui': 'C01',
 'detected_name': 'CONCEPT1',
 'end': 247,
 'id': 4,
 'meta_anns': {},
 'pretty_name': 'CONCEPT1',
 'source_value': 'we can "detect"',
 'start': 232,
 'type_ids': ['T1']}