How to create and use a (core) component with medcat v2¶
The overall process is quite simple:
- Implement and extend
CoreComponent - Specify the
CoreComponentType - Register the component
Creating component¶
In [ ]:
Copied!
# for init args
from typing import Optional, Any
from medcat.tokenizing.tokenizers import BaseTokenizer
from medcat.vocab import Vocab
from medcat.cdb.cdb import CDB
from medcat.config.config import Ner
# for the component itself
from medcat.components.types import CoreComponentType
from medcat.components.types import AbstractEntityProvidingComponent
from medcat.tokenizing.tokens import MutableDocument, MutableEntity
from medcat.components.ner.vocab_based_annotator import maybe_annotate_name
# for the randomness
import random
class RNG:
def __init__(self, min: int, mean: int, max: int, std: float):
self.min = min
self.mean = mean
self.max = max
self.std = std
def get(self):
num = int(random.normalvariate(self.mean, self.std))
return min(max(self.min, num), self.max)
class RandomNER(AbstractEntityProvidingComponent):
# NOTE: NEED TO IMPLEMENT
name = "RANDOM_NER"
# NOTE: NEED TO IMPLEMENT
@classmethod
def create_new_component(
cls, cnf: Ner, tokenizer: BaseTokenizer, cdb: CDB, vocab: Vocab,
model_load_path: Optional[str]) -> 'RandomNER':
return cls(tokenizer, cdb)
# NOTE: NEED TO IMPLEMENT
# you can specify whatever init args as long as you define them above
def __init__(self, tokenizer: BaseTokenizer, cdb: CDB):
super().__init__()
self.tokenizer = tokenizer
self.cdb = cdb
# this is just for the randomness to kind of make sense
# i.e create an entity for every 10 tokens
self.tkns_per_entity = 10
# random number generator for length of the entity (in tokens)
self.rng_len = RNG(
min=1, mean=4, max=8, std=2
)
# NOTE: NEED TO IMPLEMENT
# the type of core component
def get_type(self) -> CoreComponentType:
return CoreComponentType.ner
# NOTE: NEED TO IMPLEMENT
def predict_entities(self, doc: MutableDocument,
ents: list[MutableEntity] | None = None
) -> list[MutableEntity]:
"""Detect candidates for concepts - linker will then be able
to do the rest. It adds `entities` to the doc.entities and each
entity can have the entity.link_candidates - that the linker
will resolve.
Args:
doc (MutableDocument):
Spacy document to be annotated with named entities.
ents list[MutableEntity] | None = None:
The entties to use. None expected here.
Returns:
doc (MutableDocument):
Spacy document with detected entities.
"""
num_tokens = len(list(doc))
num_ents = num_tokens // self.tkns_per_entity
start_tkn_indices = sorted([random.randint(0, num_tokens - 3)
for _ in range(num_ents)])
end_tkn_indices = [min(start + self.rng_len.get(), num_tokens - 2)
for start in start_tkn_indices]
choose_from = list(self.cdb.name2info.keys())
chosen_name = [random.choice(choose_from) for _ in start_tkn_indices]
ner_ents: list[MutableEntity] = []
for tkn_start_idx, tkn_end_idx, linked_name in zip(start_tkn_indices, end_tkn_indices, chosen_name):
char_start_idx = doc[tkn_start_idx].base.char_index
# NOTE: can only do this since we're never selecting the last token
char_end_idx = doc[tkn_end_idx + 1].base.char_index
cur_tokens = doc.get_tokens(char_start_idx, char_end_idx)
# NOTE: the get_tokens method will only return a MutableEntity if it's been set,
# but nothing should be set before the NER component runs, so it should be
# safe to assume that these are all lists of tokens
# this checks the config (i.e length and stuff) and then annotes
ent = maybe_annotate_name(self.tokenizer, linked_name, cur_tokens, doc, self.cdb, self.cdb.config, len(ner_ents))
if ent:
ner_ents.append(ent)
return ner_ents
# for init args
from typing import Optional, Any
from medcat.tokenizing.tokenizers import BaseTokenizer
from medcat.vocab import Vocab
from medcat.cdb.cdb import CDB
from medcat.config.config import Ner
# for the component itself
from medcat.components.types import CoreComponentType
from medcat.components.types import AbstractEntityProvidingComponent
from medcat.tokenizing.tokens import MutableDocument, MutableEntity
from medcat.components.ner.vocab_based_annotator import maybe_annotate_name
# for the randomness
import random
class RNG:
def __init__(self, min: int, mean: int, max: int, std: float):
self.min = min
self.mean = mean
self.max = max
self.std = std
def get(self):
num = int(random.normalvariate(self.mean, self.std))
return min(max(self.min, num), self.max)
class RandomNER(AbstractEntityProvidingComponent):
# NOTE: NEED TO IMPLEMENT
name = "RANDOM_NER"
# NOTE: NEED TO IMPLEMENT
@classmethod
def create_new_component(
cls, cnf: Ner, tokenizer: BaseTokenizer, cdb: CDB, vocab: Vocab,
model_load_path: Optional[str]) -> 'RandomNER':
return cls(tokenizer, cdb)
# NOTE: NEED TO IMPLEMENT
# you can specify whatever init args as long as you define them above
def __init__(self, tokenizer: BaseTokenizer, cdb: CDB):
super().__init__()
self.tokenizer = tokenizer
self.cdb = cdb
# this is just for the randomness to kind of make sense
# i.e create an entity for every 10 tokens
self.tkns_per_entity = 10
# random number generator for length of the entity (in tokens)
self.rng_len = RNG(
min=1, mean=4, max=8, std=2
)
# NOTE: NEED TO IMPLEMENT
# the type of core component
def get_type(self) -> CoreComponentType:
return CoreComponentType.ner
# NOTE: NEED TO IMPLEMENT
def predict_entities(self, doc: MutableDocument,
ents: list[MutableEntity] | None = None
) -> list[MutableEntity]:
"""Detect candidates for concepts - linker will then be able
to do the rest. It adds `entities` to the doc.entities and each
entity can have the entity.link_candidates - that the linker
will resolve.
Args:
doc (MutableDocument):
Spacy document to be annotated with named entities.
ents list[MutableEntity] | None = None:
The entties to use. None expected here.
Returns:
doc (MutableDocument):
Spacy document with detected entities.
"""
num_tokens = len(list(doc))
num_ents = num_tokens // self.tkns_per_entity
start_tkn_indices = sorted([random.randint(0, num_tokens - 3)
for _ in range(num_ents)])
end_tkn_indices = [min(start + self.rng_len.get(), num_tokens - 2)
for start in start_tkn_indices]
choose_from = list(self.cdb.name2info.keys())
chosen_name = [random.choice(choose_from) for _ in start_tkn_indices]
ner_ents: list[MutableEntity] = []
for tkn_start_idx, tkn_end_idx, linked_name in zip(start_tkn_indices, end_tkn_indices, chosen_name):
char_start_idx = doc[tkn_start_idx].base.char_index
# NOTE: can only do this since we're never selecting the last token
char_end_idx = doc[tkn_end_idx + 1].base.char_index
cur_tokens = doc.get_tokens(char_start_idx, char_end_idx)
# NOTE: the get_tokens method will only return a MutableEntity if it's been set,
# but nothing should be set before the NER component runs, so it should be
# safe to assume that these are all lists of tokens
# this checks the config (i.e length and stuff) and then annotes
ent = maybe_annotate_name(self.tokenizer, linked_name, cur_tokens, doc, self.cdb, self.cdb.config, len(ner_ents))
if ent:
ner_ents.append(ent)
return ner_ents
Registering the component¶
In [ ]:
Copied!
from medcat.components.types import register_core_component
# NOTE: in MedCAT v2.5 and onwards, you can import and use `lazy_register_core_component`
# to do lazy registartion instead
register_core_component(CoreComponentType.ner, RandomNER.name, RandomNER.create_new_component)
from medcat.components.types import register_core_component
# NOTE: in MedCAT v2.5 and onwards, you can import and use `lazy_register_core_component`
# to do lazy registartion instead
register_core_component(CoreComponentType.ner, RandomNER.name, RandomNER.create_new_component)
Using custom component¶
In [ ]:
Copied!
from medcat.config.config import Config
from medcat.preprocessors.cleaners import NameDescriptor
from medcat.cat import CAT
import numpy as np
from pprint import pprint
# start with a config
cnf = Config()
# NOTE: the default is to use regex tokenizer
# set the new/registered component
cnf.components.ner.comp_name = RandomNER.name
# creating an empty Vocab - we don't really need it for this demonstration
vocab = Vocab()
print("Vocab (should be empty):", vocab.vocab)
# create a (random!) CDB
rndom_concepts = {
"C01": (
{"CONCEPT1": NameDescriptor(
tokens=["CONCEPT", "1"],
snames=["concept", "1"],
raw_name="CONCEPT1",
is_upper=True)
}, {"ONT1",}, "Concept 1 description"
),
"C02": (
{"CONCEPT2": NameDescriptor(
tokens=["CONCEPT", "2"],
snames=["concept", "2"],
raw_name="CONCEPT2",
is_upper=True)
}, {"ONT1",}, "Concept 2 description"
)
}
cdb = CDB(cnf)
for cui, (names, ontologies, descr) in rndom_concepts.items():
cdb._add_concept(cui=cui, names=names, ontologies=ontologies,
name_status='P', type_ids=['T1'],
description=descr, full_build=True)
print("CDB cui2info", cdb.cui2info)
print("CDB name2info", cdb.cui2info)
# creeate CAT
cat = CAT(cdb, vocab)
print("Got CAT")
print("Verifying the type of component we're using")
print("NER:", cat._pipeline.get_component(CoreComponentType.ner))
text = """Some friends are concept1, but most foes is concept2.
We can have a much longer conversation about concepts1 and concepts2 which is
not going to get us anywhere we are not already.
This is just filler stuff for the concepts so that we can "detect" multiple ones.
"""
ents = cat.get_entities(text)
print("ENTITIES")
for ent in ents["entities"].values():
pprint(ent)
from medcat.config.config import Config
from medcat.preprocessors.cleaners import NameDescriptor
from medcat.cat import CAT
import numpy as np
from pprint import pprint
# start with a config
cnf = Config()
# NOTE: the default is to use regex tokenizer
# set the new/registered component
cnf.components.ner.comp_name = RandomNER.name
# creating an empty Vocab - we don't really need it for this demonstration
vocab = Vocab()
print("Vocab (should be empty):", vocab.vocab)
# create a (random!) CDB
rndom_concepts = {
"C01": (
{"CONCEPT1": NameDescriptor(
tokens=["CONCEPT", "1"],
snames=["concept", "1"],
raw_name="CONCEPT1",
is_upper=True)
}, {"ONT1",}, "Concept 1 description"
),
"C02": (
{"CONCEPT2": NameDescriptor(
tokens=["CONCEPT", "2"],
snames=["concept", "2"],
raw_name="CONCEPT2",
is_upper=True)
}, {"ONT1",}, "Concept 2 description"
)
}
cdb = CDB(cnf)
for cui, (names, ontologies, descr) in rndom_concepts.items():
cdb._add_concept(cui=cui, names=names, ontologies=ontologies,
name_status='P', type_ids=['T1'],
description=descr, full_build=True)
print("CDB cui2info", cdb.cui2info)
print("CDB name2info", cdb.cui2info)
# creeate CAT
cat = CAT(cdb, vocab)
print("Got CAT")
print("Verifying the type of component we're using")
print("NER:", cat._pipeline.get_component(CoreComponentType.ner))
text = """Some friends are concept1, but most foes is concept2.
We can have a much longer conversation about concepts1 and concepts2 which is
not going to get us anywhere we are not already.
This is just filler stuff for the concepts so that we can "detect" multiple ones.
"""
ents = cat.get_entities(text)
print("ENTITIES")
for ent in ents["entities"].values():
pprint(ent)
Training was enabled during inference. It was automatically disabled.
Vocab (should be empty): {}
CDB cui2info {'C01': CUIInfo(cui='C01', preferred_name='CONCEPT1', names={'CONCEPT1'}, subnames={'1', 'concept'}, type_ids=['T1'], description='Concept 1 description', original_names={'CONCEPT1'}, tags=[], group=None, in_other_ontology={'ontologies': {'ONT1'}}, count_train=0, context_vectors=None, average_confidence=0.0), 'C02': CUIInfo(cui='C02', preferred_name='CONCEPT2', names={'CONCEPT2'}, subnames={'concept', '2'}, type_ids=['T1'], description='Concept 2 description', original_names={'CONCEPT2'}, tags=[], group=None, in_other_ontology={'ontologies': {'ONT1'}}, count_train=0, context_vectors=None, average_confidence=0.0)}
CDB name2info {'C01': CUIInfo(cui='C01', preferred_name='CONCEPT1', names={'CONCEPT1'}, subnames={'1', 'concept'}, type_ids=['T1'], description='Concept 1 description', original_names={'CONCEPT1'}, tags=[], group=None, in_other_ontology={'ontologies': {'ONT1'}}, count_train=0, context_vectors=None, average_confidence=0.0), 'C02': CUIInfo(cui='C02', preferred_name='CONCEPT2', names={'CONCEPT2'}, subnames={'concept', '2'}, type_ids=['T1'], description='Concept 2 description', original_names={'CONCEPT2'}, tags=[], group=None, in_other_ontology={'ontologies': {'ONT1'}}, count_train=0, context_vectors=None, average_confidence=0.0)}
Got CAT
Verifying the type of component we're using
NER: <__main__.RandomNER object at 0x10a47f670>
ENTITIES
{'acc': 1,
'context_center': [],
'context_left': [],
'context_right': [],
'context_similarity': 1,
'cui': 'C01',
'detected_name': 'CONCEPT1',
'end': 92,
'id': 0,
'meta_anns': {},
'pretty_name': 'CONCEPT1',
'source_value': 'concept2.\nWe can have a much longer conversation',
'start': 44,
'type_ids': ['T1']}
{'acc': 1,
'context_center': [],
'context_left': [],
'context_right': [],
'context_similarity': 1,
'cui': 'C02',
'detected_name': 'CONCEPT2',
'end': 231,
'id': 2,
'meta_anns': {},
'pretty_name': 'CONCEPT2',
'source_value': 'just filler stuff for the concepts so that',
'start': 189,
'type_ids': ['T1']}
{'acc': 1,
'context_center': [],
'context_left': [],
'context_right': [],
'context_similarity': 1,
'cui': 'C01',
'detected_name': 'CONCEPT1',
'end': 151,
'id': 1,
'meta_anns': {},
'pretty_name': 'CONCEPT1',
'source_value': 'going to get us',
'start': 136,
'type_ids': ['T1']}
{'acc': 1,
'context_center': [],
'context_left': [],
'context_right': [],
'context_similarity': 1,
'cui': 'C01',
'detected_name': 'CONCEPT1',
'end': 247,
'id': 4,
'meta_anns': {},
'pretty_name': 'CONCEPT1',
'source_value': 'we can "detect"',
'start': 232,
'type_ids': ['T1']}