Skip to content

medcat.pipeline.speed_utils

Classes:

Functions:

Attributes:

logger module-attribute

logger = getLogger(__name__)

AveragingTimedComponent

AveragingTimedComponent(component: BaseComponent, condition: Callable[[int, float], bool])

Bases: AveragingTimedObject

Source code in medcat-v2/medcat/pipeline/speed_utils.py
175
176
177
178
179
def __init__(
        self,
        component: BaseComponent,
        condition: Callable[[int, float], bool],
) -> None:
    """Initialise the wrapper around ``component``.

    Args:
        component (BaseComponent): The component to wrap.
        condition (Callable[[int, float], bool]): Passed on unchanged
            to the parent initialiser.
    """
    super().__init__(component, condition)
    # Bare annotation, no assignment: it only narrows the declared type
    # of the attribute to BaseComponent for type checkers.
    self._component: BaseComponent

AveragingTimedObject

AveragingTimedObject(component: Union[BaseComponent, BaseTokenizer], condition: Callable[[int, float], bool])

Bases: BaseTimedObject

Source code in medcat-v2/medcat/pipeline/speed_utils.py
135
136
137
138
139
def __init__(
        self,
        component: Union[BaseComponent, BaseTokenizer],
        condition: Callable[[int, float], bool],
):
    """Store the wrapped object and the reporting condition.

    Args:
        component (Union[BaseComponent, BaseTokenizer]): The object to
            wrap; handed to the parent initialiser.
        condition (Callable[[int, float], bool]): Predicate kept on the
            instance as ``_condition``.
    """
    super().__init__(component)
    self._condition = condition
    # NOTE(review): _reset() is defined outside this view; presumably it
    # puts the accumulated state into its initial (empty) form - confirm.
    self._reset()

AveragingTimedTokenizer

AveragingTimedTokenizer(component: BaseTokenizer, condition: Callable[[int, float], bool])

Bases: AveragingTimedObject

Source code in medcat-v2/medcat/pipeline/speed_utils.py
191
192
193
194
195
def __init__(
        self,
        component: BaseTokenizer,
        condition: Callable[[int, float], bool],
) -> None:
    """Initialise the wrapper around the tokenizer ``component``.

    Args:
        component (BaseTokenizer): The tokenizer to wrap.
        condition (Callable[[int, float], bool]): Passed on unchanged
            to the parent initialiser.
    """
    super().__init__(component, condition)
    # Bare annotation, no assignment: it only narrows the declared type
    # of the attribute to BaseTokenizer for type checkers.
    self._component: BaseTokenizer

BaseTimedComponent

Bases: Protocol

BaseTimedObject

BaseTimedObject(component: Union[BaseComponent, BaseTokenizer])

Attributes:

Source code in medcat-v2/medcat/pipeline/speed_utils.py
63
64
def __init__(
        self,
        component: Union[BaseComponent, BaseTokenizer],
):
    """Keep a reference to the wrapped object.

    Args:
        component (Union[BaseComponent, BaseTokenizer]): The component
            or tokenizer being wrapped; stored as ``_component``.
    """
    self._component = component

full_name property

full_name

BaseTimedObjectProtocol

Bases: Protocol

Attributes:

full_name property

full_name: str

BaseTimedTokenizer

Bases: Protocol

PerDocTimedObject

PerDocTimedObject(component: Union[BaseComponent, BaseTokenizer])

Bases: BaseTimedObject

Methods:

Source code in medcat-v2/medcat/pipeline/speed_utils.py
63
64
def __init__(
        self,
        component: Union[BaseComponent, BaseTokenizer],
):
    """Keep a reference to the wrapped object.

    Args:
        component (Union[BaseComponent, BaseTokenizer]): The component
            or tokenizer being wrapped; stored as ``_component``.
    """
    self._component = component

time_it

time_it(to_run: Callable[[], MutableDocument]) -> MutableDocument
Source code in medcat-v2/medcat/pipeline/speed_utils.py
102
103
104
105
106
107
def time_it(self, to_run: Callable[[], MutableDocument]) -> MutableDocument:
    start = time.perf_counter()
    result = to_run()
    elapsed_ms = (time.perf_counter() - start) * 1000
    logger.info("Component %s took %.3fms", self.full_name, elapsed_ms)
    return result

ProfiledComponent

ProfiledComponent(component: BaseComponent)

Bases: ProfiledObject

Source code in medcat-v2/medcat/pipeline/speed_utils.py
228
229
230
231
def __init__(self, component: BaseComponent) -> None:
    """Initialise the profiling wrapper around ``component``.

    Args:
        component (BaseComponent): The component to wrap.
    """
    super().__init__(component)
    # Bare annotation, no assignment: it only narrows the declared type
    # of the attribute to BaseComponent for type checkers.
    self._component: BaseComponent

ProfiledObject

ProfiledObject(component: Union[BaseComponent, BaseTokenizer])

Bases: BaseTimedObject

Methods:

Source code in medcat-v2/medcat/pipeline/speed_utils.py
207
208
209
def __init__(
        self,
        component: Union[BaseComponent, BaseTokenizer],
):
    """Wrap ``component`` and create a profiler for it.

    Args:
        component (Union[BaseComponent, BaseTokenizer]): The component
            or tokenizer to wrap; handed to the parent initialiser.
    """
    super().__init__(component)
    # Each wrapper owns its own cProfile.Profile instance.
    self._profiler = cProfile.Profile()

show_stats

show_stats(limit: int = 20)
Source code in medcat-v2/medcat/pipeline/speed_utils.py
221
222
223
def show_stats(self, limit: int = 20):
    """Show the collected stats, first by 'tottime' and then by 'cumtime'.

    Args:
        limit (int): Forwarded to ``_show_type`` for each view.
            Defaults to 20.
    """
    # NOTE(review): the two keys look like pstats sort keys - confirm
    # against _show_type, which is not visible here.
    for sort_key in ('tottime', 'cumtime'):
        self._show_type(sort_key, limit)

ProfiledTokenizer

ProfiledTokenizer(component: BaseTokenizer)

Bases: ProfiledObject

Source code in medcat-v2/medcat/pipeline/speed_utils.py
242
243
244
245
def __init__(self, component: BaseTokenizer) -> None:
    """Initialise the profiling wrapper around the tokenizer.

    Args:
        component (BaseTokenizer): The tokenizer to wrap.
    """
    super().__init__(component)
    # Bare annotation, no assignment: it only narrows the declared type
    # of the attribute to BaseTokenizer for type checkers.
    self._component: BaseTokenizer

TimedComponent

TimedComponent(component: BaseComponent)

Bases: PerDocTimedObject

Wraps a component and logs the time spent in it.

Source code in medcat-v2/medcat/pipeline/speed_utils.py
113
114
115
116
def __init__(self, component: BaseComponent) -> None:
    """Initialise the timing wrapper around ``component``.

    Args:
        component (BaseComponent): The component to wrap.
    """
    super().__init__(component)
    # Bare annotation, no assignment: it only narrows the declared type
    # of the attribute to BaseComponent for type checkers.
    self._component: BaseComponent

TimedComponentProtocol

TimedTokenizer

TimedTokenizer(component: BaseTokenizer)

Bases: PerDocTimedObject

Source code in medcat-v2/medcat/pipeline/speed_utils.py
124
125
126
127
def __init__(self, component: BaseTokenizer) -> None:
    """Initialise the timing wrapper around the tokenizer.

    Args:
        component (BaseTokenizer): The tokenizer to wrap.
    """
    super().__init__(component)
    # Bare annotation, no assignment: it only narrows the declared type
    # of the attribute to BaseTokenizer for type checkers.
    self._component: BaseTokenizer

TimedTokenizerProtocol

context_manager_with_logging

context_manager_with_logging(func)
Source code in medcat-v2/medcat/pipeline/speed_utils.py
40
41
42
43
44
45
46
def context_manager_with_logging(func):
    """Turn a generator function into a context manager with logging.

    The returned callable behaves like ``contextlib.contextmanager(func)``
    except that the whole lifetime of the generator runs inside the
    ``_with_logging()`` context.

    Args:
        func: A generator function that yields exactly once, as required
            by ``contextlib.contextmanager``.

    Returns:
        A context-manager factory carrying the metadata (name, docstring,
        ``__wrapped__``) of ``func``.
    """
    # ``wraps`` belongs to functools. ``contextlib.wraps`` only resolves
    # because contextlib imports it for internal use; it is not part of
    # contextlib's public API, so do not rely on it.
    from functools import wraps

    @wraps(func)
    @contextlib.contextmanager
    def wrapper(*args, **kwargs):
        with _with_logging():
            # Delegate to the wrapped generator so that exceptions thrown
            # in by the ``with`` statement reach its try/finally blocks.
            yield from func(*args, **kwargs)
    return wrapper

pipeline_per_doc_timer

Time the pipeline on a per document basis.

Parameters:

  • pipeline

    (Pipeline) –

    The pipeline to time.

  • timer_init

    (Callable[[BaseComponent], TimedComponentProtocol], default: TimedComponent ) –

    The initialiser for the timer. Defaults to TimedComponent.

  • tknzer_timer_init

    (Callable[[BaseTokenizer], TimedTokenizerProtocol], default: TimedTokenizer ) –

    The initialiser for the timer for the tokenizer. Defaults to TimedTokenizer.

Yields:

  • Pipeline

    The same pipeline.

Source code in medcat-v2/medcat/pipeline/speed_utils.py
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
@context_manager_with_logging
def pipeline_per_doc_timer(
        pipeline: Pipeline,
        timer_init: Callable[[BaseComponent],
                             TimedComponentProtocol] = TimedComponent,
        tknzer_timer_init: Callable[[BaseTokenizer],
                                    TimedTokenizerProtocol] = TimedTokenizer,
    ):
    """Time the pipeline on a per document basis.

    While the context is active the tokenizer, the core components and
    the addons of the pipeline are replaced by timing wrappers. The
    originals are put back on exit, also when an exception is raised.

    Args:
        pipeline (Pipeline): The pipeline to time.
        timer_init (Callable[[BaseComponent], TimedComponentProtocol]): The
            initialiser for the timer. Defaults to TimedComponent.
        tknzer_timer_init (Callable[[BaseTokenizer], TimedTokenizerProtocol]):
            The initialiser for the timer for the tokenizer. Defaults to
            TimedTokenizer.

    Yields:
        Pipeline: The same pipeline.
    """
    # Remember what has to be restored afterwards.
    saved_tokenizer = pipeline._tokenizer
    saved_components = pipeline._components
    saved_addons = pipeline._addons

    # Build all wrappers first (core components, addons, tokenizer - in
    # that order) so the pipeline is only modified once all of them exist.
    timed_components = [
        cast(CoreComponent, timer_init(core_comp))
        for core_comp in saved_components]
    timed_addons = [
        cast(AddonComponent, timer_init(addon))
        for addon in saved_addons]
    timed_tokenizer = cast(
        BaseTokenizer, tknzer_timer_init(saved_tokenizer))

    pipeline._tokenizer = timed_tokenizer
    pipeline._components = timed_components
    pipeline._addons = timed_addons

    try:
        yield pipeline
    finally:
        pipeline._tokenizer = saved_tokenizer
        pipeline._components = saved_components
        pipeline._addons = saved_addons

pipeline_timer_averaging_docs

pipeline_timer_averaging_docs(pipeline: Pipeline, show_frequency_docs: int = -1, show_frequency_secs: float = -1)

Time the pipeline on a multi doc basis.

This can be set to show timings after a certain number of docs or after a certain time spent. The default configuration averages over 100 documents.

Parameters:

  • pipeline

    (Pipeline) –

    The pipeline to time.

  • show_frequency_docs

    (int, default: -1 ) –

    The number of documents to average over, or (if set to -1) use seconds instead. Defaults to -1 if secs frequency set and to 100 otherwise.

  • show_frequency_secs

    (float, default: -1 ) –

    The frequency in seconds for showing the average timings for each component. Defaults to -1.

Raises:

  • ValueError

    If one of the frequencies is 0.

  • ValueError

    If both document and time frequencies are specified.

Yields:

  • Pipeline

    The same pipeline.

Source code in medcat-v2/medcat/pipeline/speed_utils.py
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
@context_manager_with_logging
def pipeline_timer_averaging_docs(
        pipeline: Pipeline,
        show_frequency_docs: int = -1,
        show_frequency_secs: float = -1):
    """Time the pipeline on a multi doc basis.

    This can be set to show timings after a certain number of docs or
    after a certain time spent. The default configuration averages over 100
    documents.

    While the context is active the tokenizer, the core components and
    the addons of the pipeline are replaced by averaging wrappers. The
    originals are put back on exit, also when an exception is raised.

    Args:
        pipeline (Pipeline): The pipeline to time.
        show_frequency_docs (int): The number of documents to average
            over, or (if set to -1) use seconds instead. Defaults to -1 if
            secs frequency set and to 100 otherwise.
        show_frequency_secs (float): The frequency in seconds for showing the
            average timings for each component. Defaults to -1.

    Raises:
        ValueError: If one of the frequencies is neither -1 (disabled)
            nor greater than 0.
        ValueError: If both document and time frequencies are specified.

    Yields:
        Pipeline: The same pipeline.
    """
    for frequency in (show_frequency_docs, show_frequency_secs):
        # -1 is the only "disabled" marker. Anything else must be strictly
        # positive: a value such as -5 would otherwise slip through and
        # make the time based condition below fire on every call.
        if frequency != -1 and not frequency > 0:
            raise ValueError(
                "Frequency values must be greater than 0 or -1 (disabled)")
    if show_frequency_docs > 0 and show_frequency_secs > 0:
        raise ValueError("Choose either document frequency OR time frequency")
    if show_frequency_secs == -1 and show_frequency_docs == -1:
        # Nothing was configured: fall back to averaging over 100 docs.
        show_frequency_docs = 100

    original_tokenizer = pipeline._tokenizer
    original_components = pipeline._components
    original_addons = pipeline._addons

    def wrapper_condition(num_docs: int, time_spent: float) -> bool:
        # After validation exactly one of the two frequencies is positive.
        if show_frequency_docs > 0:
            return num_docs >= show_frequency_docs
        return time_spent >= show_frequency_secs

    wrapped_core_comps = [
        AveragingTimedComponent(component, wrapper_condition)
        for component in original_components]
    wrapped_addons = [
        AveragingTimedComponent(addon, wrapper_condition)
        for addon in original_addons]
    wrapped_tokenizer = AveragingTimedTokenizer(
            original_tokenizer, wrapper_condition)

    pipeline._tokenizer = wrapped_tokenizer  # type: ignore
    pipeline._components = wrapped_core_comps  # type: ignore
    pipeline._addons = wrapped_addons  # type: ignore

    try:
        yield pipeline
    finally:
        pipeline._tokenizer = original_tokenizer
        pipeline._components = original_components
        pipeline._addons = original_addons
        timed_objects: list[AveragingTimedObject] = [
            wrapped_tokenizer, *wrapped_core_comps, *wrapped_addons
        ]

        # NOTE(review): presumably _to_average holds timings that have not
        # been shown yet, so this reports the leftovers - confirm against
        # AveragingTimedObject.
        for comp in timed_objects:
            if comp._to_average:
                comp._show_time()
                comp._reset()

profile_pipeline_component

profile_pipeline_component(pipeline: Pipeline, comp_type: Union[CoreComponentType, Type[AddonType], Literal['tokenizer']], limit: int = 20)

Profile a specific component of the pipeline.

This can profile either a core component or an addon component. But notably, in case of addon components, all components of the same type will be profiled.

Parameters:

  • pipeline

    (Pipeline) –

    The pipeline to time.

  • comp_type

    (Union[CoreComponentType, Type[AddonType], Literal['tokenizer']]) –

    The type of component to profile. This can be either a core component or an addon component, or the tokenizer.

  • limit

    (int, default: 20 ) –

    The number of function calls to show in output. Defaults to 20.

Yields:

  • Pipeline

    The same pipeline.

Source code in medcat-v2/medcat/pipeline/speed_utils.py
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
@context_manager_with_logging
def profile_pipeline_component(
        pipeline: Pipeline,
        comp_type: Union[CoreComponentType, Type[AddonType], Literal['tokenizer']],
        limit: int = 20,
    ):
    """Profile a specific component of the pipeline.

    This can profile either a core component or an addon component.
    But notably, in case of addon components, all components of the
    same type will be profiled. The selected part of the pipeline is
    replaced by a profiling wrapper while the context is active; on
    exit the originals are restored and the stats are shown.

    Args:
        pipeline (Pipeline): The pipeline to profile.
        comp_type (Union[CoreComponentType, Type[AddonType], Literal['tokenizer']]):
            The type of component to profile. This can be either a core component
            or an addon component, or the tokenizer.
        limit (int): The number of function calls to show in output.
            Defaults to 20.

    Yields:
        Pipeline: The same pipeline.
    """
    saved_tokenizer = pipeline._tokenizer
    saved_components = pipeline._components
    saved_addons = pipeline._addons

    # Start from "nothing replaced"; exactly one of the three parts is
    # swapped for profiled versions below.
    new_tokenizer = saved_tokenizer
    new_core_comps: list[CoreComponent] = saved_components
    new_addons: list[AddonComponent] = saved_addons

    if isinstance(comp_type, CoreComponentType):
        target = pipeline.get_component(comp_type)
        new_core_comps = []
        for core_comp in saved_components:
            if core_comp != target:
                new_core_comps.append(core_comp)
            else:
                new_core_comps.append(
                    cast(CoreComponent, ProfiledComponent(target)))
    elif comp_type == 'tokenizer':
        new_tokenizer = cast(
            BaseTokenizer, ProfiledTokenizer(saved_tokenizer))
    else:
        # comp_type is an addon class: every addon of that type is wrapped
        matching_addons = [
            addon for addon in pipeline.iter_addons()
            if isinstance(addon, comp_type)
        ]
        new_addons = []
        for addon in saved_addons:
            if addon not in matching_addons:
                new_addons.append(addon)
            else:
                new_addons.append(
                    cast(AddonComponent, ProfiledComponent(addon)))

    # Only the wrappers created above have stats to show afterwards.
    candidates = [*new_core_comps, *new_addons, new_tokenizer]
    profiled_comps = [
        candidate for candidate in candidates
        if isinstance(candidate, ProfiledObject)
    ]

    pipeline._tokenizer = new_tokenizer
    pipeline._components = new_core_comps
    pipeline._addons = new_addons

    try:
        yield pipeline
    finally:
        pipeline._tokenizer = saved_tokenizer
        pipeline._components = saved_components
        pipeline._addons = saved_addons
        for profiled in profiled_comps:
            profiled.show_stats(limit)