mbrs.metrics package

mbrs.metrics package#

Submodules#

Module contents#

class mbrs.metrics.Metric(cfg: Config)[source]#

Bases: MetricBase

Base metric class.

corpus_score(hypotheses: list[str], references_lists: list[list[str]], sources: list[str] | None = None) → float[source]#

Calculate the corpus-level score.

Parameters:

hypotheses (list[str]) – Hypotheses.
references_lists (list[list[str]]) – Lists of references.
sources (list[str], optional) – Sources.

Returns:

The corpus score.

Return type:

float

expected_scores(hypotheses: list[str], references: list[str], source: str | None = None, reference_lprobs: Tensor | None = None) → Tensor[source]#

Calculate the expected scores for each hypothesis.

Parameters:

hypotheses (list[str]) – Hypotheses.
references (list[str]) – References.
source (str, optional) – A source.
reference_lprobs (Tensor, optional) – Log-probabilities for each reference sample. The shape must be (len(references),). See https://arxiv.org/abs/2311.05263.

Returns:

The expected scores for each hypothesis.

Return type:

Tensor

pairwise_scores(hypotheses: list[str], references: list[str], source: str | None = None) → Tensor[source]#

Calculate the pairwise scores.

Parameters:

hypotheses (list[str]) – Hypotheses.
references (list[str]) – References.
source (str, optional) – A source.

Returns:

Score matrix of shape (H, R), where H is the number of hypotheses and R is the number of references.

Return type:

Tensor

abstract score(hypothesis: str, reference: str, source: str | None = None) → float[source]#

Calculate the score of the given hypothesis.

Parameters:

hypothesis (str) – A hypothesis.
reference (str) – A reference.
source (str, optional) – A source.

Returns:

The score of the given hypothesis.

Return type:

float

scores(hypotheses: list[str], references: list[str], sources: list[str] | None = None) → Tensor[source]#

Calculate the scores of the given hypotheses.

Parameters:

hypotheses (list[str]) – N hypotheses.
references (list[str]) – N references.
sources (list[str], optional) – N sources.

Returns:

The N scores of the given hypotheses.

Return type:

Tensor

class mbrs.metrics.MetricAggregatable(cfg: Config)[source]#

Bases: Metric

Base class for aggregatable metrics.

This class supports reference aggregation.

abstract expected_scores_reference_aggregation(hypotheses: list[str], references: list[str], source: str | None = None, reference_lprobs: Tensor | None = None) → Tensor[source]#

Calculate the expected scores for each hypothesis.

Parameters:

hypotheses (list[str]) – Hypotheses.
references (list[str]) – References.
source (str, optional) – A source.
reference_lprobs (Tensor, optional) – Log-probabilities for each reference sample. The shape must be (len(references),). See https://arxiv.org/abs/2311.05263.

Returns:

The expected scores for each hypothesis.

Return type:

Tensor

class mbrs.metrics.MetricAggregatableCache(cfg: Config)[source]#

Bases: MetricAggregatable, MetricCacheable

Base class for metrics that can aggregate the cache.

This class supports aggregating intermediate representations of sentences.

class Cache[source]#

Bases: Cache

Intermediate representations of sentences.

abstract aggregate(reference_lprobs: Tensor | None = None) → Cache[source]#

Aggregate the cached representations.

Parameters: reference_lprobs (Tensor, optional) – Log-probabilities for each reference sample. The shape must be (len(references),). See https://arxiv.org/abs/2311.05263.
Returns: An aggregated representation.
Return type: Cache

cluster(kmeans: Kmeans) → tuple[Cache, Tensor][source]#

Cluster the cached representations.

Parameters:

kmeans (Kmeans) – k-means class to perform clustering.

Returns:

Cache: Centroid representations.
Tensor: N assigned IDs.

Return type:

tuple[Cache, Tensor]

expected_scores_reference_aggregation(hypotheses: list[str], references: list[str], source: str | None = None, reference_lprobs: Tensor | None = None) → Tensor[source]#

Calculate the expected scores for each hypothesis.

Parameters:

hypotheses (list[str]) – Hypotheses.
references (list[str]) – References.
source (str, optional) – A source.
reference_lprobs (Tensor, optional) – Log-probabilities for each reference sample. The shape must be (len(references),). See https://arxiv.org/abs/2311.05263.

Returns:

The expected scores for each hypothesis.

Return type:

Tensor

class mbrs.metrics.MetricBERTScore(cfg: Config)[source]#

Bases: MetricCacheable

BERTScore metric class.

class Cache(embeddings: list[Tensor], idf_weights: list[Tensor])[source]#

Bases: Cache

Intermediate representations of sentences.

embeddings (list[Tensor]): A list of token embeddings of shape (T, D),
where T is the length of sequence, and D is a size of the embedding.
idf_weights (list[Tensor]): A list of IDF weights of shape (T,).

embeddings: list[Tensor]#

idf_weights: list[Tensor]#

repeat(n: int) → Cache[source]#

Repeat the representations by n times.

Parameters: n (int) – The number of repetitions.
Returns: The repeated cache.
Return type: Cache

class Config(score_type: BERTScoreScoreType = BERTScoreScoreType.f1, model_type: str | None = None, num_layers: int | None = None, batch_size: int = 64, nthreads: int = 4, idf: bool = False, idf_sents: list[str] | None = None, lang: str | None = None, rescale_with_baseline: bool = False, baseline_path: str | None = None, use_fast_tokenizer: bool = False, fp16: bool = False, bf16: bool = False, cpu: bool = False)[source]#

Bases: Config

BERTScore metric configuration.

score_type (BERTScoreScoreType): The output score type, i.e.,
precision, recall, or f1.
model_type (str): Contextual embedding model specification; defaults to the
suggested model for the target language. At least one of model_type or lang must be specified.
num_layers (int): The layer of representation to use. Default using the number
of layer tuned on WMT16 correlation data.
idf (bool): A boolean specifying whether to use idf or not. (This should be
True even if idf_sents is given.)
idf_sents (list[str]): List of sentences used to compute the idf weights.
batch_size (int): Bert score processing batch size
nthreads (int): Number of threads.
lang (str): Language of the sentences; has to specify at least one of
model_type or lang. lang needs to be specified when rescale_with_baseline is True.
rescale_with_baseline (bool): Rescale bertscore with pre-computed baseline.
baseline_path (str): Customized baseline file.
use_fast_tokenizer (bool): use_fast parameter passed to HF tokenizer.
fp16 (bool): Use float16 for the forward computation.
bf16 (bool): Use bfloat16 for the forward computation.
cpu (bool): Use CPU for the forward computation.

baseline_path: str | None = None#

batch_size: int = 64#

bf16: bool = False#

cpu: bool = False#

fp16: bool = False#

idf: bool = False#

idf_sents: list[str] | None = None#

lang: str | None = None#

model_type: str | None = None#

nthreads: int = 4#

num_layers: int | None = None#

rescale_with_baseline: bool = False#

score_type: BERTScoreScoreType = 2#

use_fast_tokenizer: bool = False#

cfg: Config#

corpus_score(hypotheses: list[str], references_lists: list[list[str]], sources: list[str] | None = None) → float[source]#

Calculate the corpus-level score.

Parameters:

hypotheses (list[str]) – Hypotheses.
references_lists (list[list[str]]) – Lists of references.
sources (list[str], optional) – Sources.

Returns:

The corpus score.

Return type:

float

property device: device#: Returns the device of the model.

property embed_dim: int#: Return the size of embedding dimension.

encode(sentences: list[str]) → Cache[source]#

Encode the given sentences into their intermediate representations.

Parameters:

sentences (list[str]) – Input sentences.

Returns:

Intermediate representations of shape (N, D), where N is the number of hypotheses and D is the size of the embedding dimension.

Return type:

MetricBERTScore.Cache

out_proj(hypotheses_ir: Cache, references_ir: Cache, sources_ir: Cache | None = None) → Tensor[source]#

Forward the output projection layer.

Parameters:

hypotheses_ir (Cache) – N intermediate representations of hypotheses.
references_ir (Cache) – N intermediate representations of references.
sources_ir (Cache, optional) – N intermediate representations of sources.

Returns:

N scores.

Return type:

Tensor

pad_sequence(tensors: list[Tensor]) → Tensor[source]#

pairwise_scores(hypotheses: list[str], references: list[str], *_, **__) → Tensor[source]#

Calculate the pairwise scores.

Parameters:

hypotheses (list[str]) – Hypotheses.
references (list[str]) – References.

Returns:

Score matrix of shape (H, R), where H is the number of hypotheses and R is the number of references.

Return type:

Tensor

scores(hypotheses: list[str], references: list[str], *_, **__) → Tensor[source]#

Calculate the scores of the given hypotheses.

Parameters:

hypotheses (list[str]) – N hypotheses.
references (list[str]) – N references.

Returns:

The N scores of the given hypotheses.

Return type:

Tensor

class mbrs.metrics.MetricBLEU(cfg: Config)[source]#

Bases: MetricAggregatable

BLEU metric class.

class AggregatedReference(ngrams: Counter[tuple[str, ...]], length: float)[source]#

Bases: object

Aggregated reference representation.

ngrams (Counter[tuple[str, …]]): Bags of expected n-gram counts.
length (float): Expected length of references.

length: float#

ngrams: Counter[tuple[str, ...]]#

class Config(lowercase: bool = False, force: bool = False, tokenize: str | None = None, smooth_method: str = 'exp', smooth_value: float | None = None, max_ngram_order: int = 4, effective_order: bool = True, trg_lang: str = '', num_workers: int = 8)[source]#

Bases: Config

BLEU metric configuration.

lowercase (bool): If True, lowercased BLEU is computed.
force (bool): Ignore data that looks already tokenized.
tokenize (str, optional): The tokenizer to use. If None, defaults to language-specific tokenizers with ‘13a’ as the fallback default.
smooth_method (str): The smoothing method to use (‘floor’, ‘add-k’, ‘exp’ or ‘none’).
smooth_value (float, optional): The smoothing value for floor and add-k methods. None falls back to default value.
max_ngram_order (int): If given, it overrides the maximum n-gram order (default: 4) when computing precisions.
effective_order (bool): If True, stop including n-gram orders for which precision is 0. This should be True, if sentence-level BLEU will be computed. (default: True)
trg_lang (str): An optional language code to raise potential tokenizer warnings.
num_workers (int): Number of workers for multiprocessing.

effective_order: bool = True#

force: bool = False#

lowercase: bool = False#

max_ngram_order: int = 4#

num_workers: int = 8#

smooth_method: str = 'exp'#

smooth_value: float | None = None#

tokenize: str | None = None#

trg_lang: str = ''#

cfg: Config#

corpus_score(hypotheses: list[str], references_lists: list[list[str]], sources: list[str] | None = None) → float[source]#

Calculate the corpus-level score.

Parameters:

hypotheses (list[str]) – Hypotheses.
references_lists (list[list[str]]) – Lists of references.
sources (list[str], optional) – Sources.

Returns:

The corpus score.

Return type:

float

expected_scores_reference_aggregation(hypotheses: list[str], references: list[str], source: str | None = None, reference_lprobs: Tensor | None = None) → Tensor[source]#

Calculate the expected scores for each hypothesis.

Parameters:

hypotheses (list[str]) – Hypotheses.
references (list[str]) – References.
source (str, optional) – A source.
reference_lprobs (Tensor, optional) – Log-probabilities for each reference sample. The shape must be (len(references),). See https://arxiv.org/abs/2311.05263.

Returns:

The expected scores for each hypothesis.

Return type:

Tensor

pairwise_scores(hypotheses: list[str], references: list[str], *_, **__) → Tensor[source]#

Calculate the pairwise scores.

Parameters:

hypotheses (list[str]) – Hypotheses.
references (list[str]) – References.

Returns:

Score matrix of shape (H, R), where H is the number of hypotheses and R is the number of references.

Return type:

Tensor

score(hypothesis: str, reference: str, *_, **__) → float[source]#

Calculate the score of the given hypothesis.

Parameters:

hypothesis (str) – Hypothesis.
reference (str) – Reference.

Returns:

The score of the given hypothesis.

Return type:

float

scores(hypotheses: list[str], references: list[str], *_, **__) → Tensor[source]#

Calculate the scores of the given hypotheses.

Parameters:

hypotheses (list[str]) – N hypotheses.
references (list[str]) – N references.

Returns:

The N scores of the given hypotheses.

Return type:

Tensor

class mbrs.metrics.MetricBLEURT(cfg: Config)[source]#

Bases: Metric

BLEURT metric class.

We employ the PyTorch port version to implement this metric instead of the original version: lucadiliello/bleurt-pytorch (thanks to @lucadiliello)

Available checkpoints:

lucadiliello/BLEURT-20
lucadiliello/BLEURT-20-D12
lucadiliello/BLEURT-20-D3
lucadiliello/BLEURT-20-D6
lucadiliello/bleurt-base-128
lucadiliello/bleurt-base-512
lucadiliello/bleurt-large-128
lucadiliello/bleurt-large-512
lucadiliello/bleurt-tiny-128
lucadiliello/bleurt-tiny-512

class Config(model: str = 'lucadiliello/BLEURT-20-D12', batch_size: int = 64, fp16: bool = False, bf16: bool = False, cpu: bool = False)[source]#

Bases: Config

BLEURT metric configuration.

model (str): Model name or path.
batch_size (int): Batch size.
fp16 (bool): Use float16 for the forward computation.
bf16 (bool): Use bfloat16 for the forward computation.
cpu (bool): Use CPU for the forward computation.

batch_size: int = 64#

bf16: bool = False#

cpu: bool = False#

fp16: bool = False#

model: str = 'lucadiliello/BLEURT-20-D12'#

corpus_score(hypotheses: list[str], references_lists: list[list[str]], sources: list[str] | None = None) → float[source]#

Calculate the corpus-level score.

Parameters:

hypotheses (list[str]) – Hypotheses.
references_lists (list[list[str]]) – Lists of references.
sources (list[str], optional) – Sources.

Returns:

The corpus score.

Return type:

float

property device: device#: Returns the device of the model.

pairwise_scores(hypotheses: list[str], references: list[str], *_, **__) → Tensor[source]#

Calculate the pairwise scores.

Parameters:

hypotheses (list[str]) – Hypotheses.
references (list[str]) – References.

Returns:

Score matrix of shape (H, R), where H is the number of hypotheses and R is the number of references.

Return type:

Tensor

score(hypothesis: str, reference: str, *_, **__) → float[source]#

Calculate the score of the given hypothesis.

Parameters:

hypothesis (str) – A hypothesis.
reference (str) – A reference.

Returns:

The score of the given hypothesis.

Return type:

float

scorer: BleurtForSequenceClassification#

scores(hypotheses: list[str], references: list[str], *_, **__) → Tensor[source]#

Calculate the scores of the given hypotheses.

Parameters:

hypotheses (list[str]) – N hypotheses.
references (list[str]) – N references.

Returns:

The N scores of the given hypotheses.

Return type:

Tensor

class mbrs.metrics.MetricBase(cfg: Config)[source]#

Bases: ABC

Base metric class.

class Config[source]#: Bases: object

HIGHER_IS_BETTER: bool = True#

property device: device#: Returns the device of the metric object.

class mbrs.metrics.MetricCOMET(cfg: Config)[source]#

Bases: MetricAggregatableCache

COMET metric class.

class Cache(embeddings: Tensor)[source]#

Bases: Cache

Intermediate representations of sentences.

embeddings (Tensor): Sentence embeddings of shape (N, D), where N
is the number of sentences and D is a size of the embedding dimension.

aggregate(reference_lprobs: Tensor | None = None) → Cache[source]#

Aggregate the cached representations.

Parameters: reference_lprobs (Tensor, optional) – Log-probabilities for each reference sample. The shape must be (len(references),). See https://arxiv.org/abs/2311.05263.
Returns: An aggregated representation.
Return type: Cache

cluster(kmeans: Kmeans) → tuple[Cache, Tensor][source]#

Cluster the cached representations.

Parameters:

kmeans (Kmeans) – k-means class to perform clustering.

Returns:

Cache: Centroid representations.
Tensor: N assigned IDs.

Return type:

tuple[Cache, Tensor]

embeddings: Tensor#

repeat(n: int) → Cache[source]#

Repeat the representations by n times.

Parameters: n (int) – The number of repetitions.
Returns: The repeated cache.
Return type: Cache

class Config(model: str = 'Unbabel/wmt22-comet-da', batch_size: int = 64, fp16: bool = False, bf16: bool = False, cpu: bool = False)[source]#

Bases: Config

COMET metric configuration.

model (str): Model name or path.
batch_size (int): Batch size.
fp16 (bool): Use float16 for the forward computation.
bf16 (bool): Use bfloat16 for the forward computation.
cpu (bool): Use CPU for the forward computation.

batch_size: int = 64#

bf16: bool = False#

cpu: bool = False#

fp16: bool = False#

model: str = 'Unbabel/wmt22-comet-da'#

corpus_score(hypotheses: list[str], references_lists: list[list[str]], sources: list[str] | None = None) → float[source]#

Calculate the corpus-level score.

Parameters:

hypotheses (list[str]) – Hypotheses.
references_lists (list[list[str]]) – Lists of references.
sources (list[str], optional) – Sources.

Returns:

The corpus score.

Return type:

float

Raises:

ValueError – Raise this error when sources are not given.

property device: device#: Returns the device of the model.

property embed_dim: int#: Return the size of embedding dimension.

encode(sentences: list[str]) → Cache[source]#

Encode the given sentences into their intermediate representations.

Parameters: sentences (list[str]) – Input sentences.
Returns: Intermediate representations.
Return type: MetricCOMET.Cache

out_proj(hypotheses_ir: Cache, references_ir: Cache, sources_ir: Cache) → Tensor[source]#

Forward the output projection layer.

Parameters:

hypotheses_ir (Cache) – N intermediate representations of hypotheses.
references_ir (Cache) – N intermediate representations of references.
sources_ir (Cache, optional) – N intermediate representations of sources.

Returns:

N scores.

Return type:

Tensor

class mbrs.metrics.MetricCOMETkiwi(cfg: Config)[source]#

Bases: MetricReferenceless

COMETkiwi metric class.

class Config(model: str = 'Unbabel/wmt22-cometkiwi-da', batch_size: int = 64, fp16: bool = False, bf16: bool = False, cpu: bool = False)[source]#

Bases: Config

COMETkiwi metric configuration.

model (str): Model name or path.
batch_size (int): Batch size.
fp16 (bool): Use float16 for the forward computation.
bf16 (bool): Use bfloat16 for the forward computation.
cpu (bool): Use CPU for the forward computation.

batch_size: int = 64#

bf16: bool = False#

cpu: bool = False#

fp16: bool = False#

model: str = 'Unbabel/wmt22-cometkiwi-da'#

corpus_score(hypotheses: list[str], sources: list[str]) → float[source]#

Calculate the corpus-level score.

Parameters:

hypotheses (list[str]) – Hypotheses.
sources (list[str]) – Sources.

Returns:

The corpus score.

Return type:

float

property device: device#: Returns the device of the model.

score(hypothesis: str, source: str) → float[source]#

Calculate the score of the given hypothesis.

Parameters:

hypothesis (str) – A hypothesis.
source (str) – A source.

Returns:

The score of the given hypothesis.

Return type:

float

scores(hypotheses: list[str], sources: list[str]) → Tensor[source]#

Calculate the scores of hypotheses.

Parameters:

hypotheses (list[str]) – N hypotheses.
sources (list[str]) – N sources.

Returns:

N scores of the given hypotheses.

Return type:

torch.Tensor

class mbrs.metrics.MetricCacheable(cfg: Config)[source]#

Bases: Metric

Base class for cacheable metrics.

This class supports caching intermediate representations of sentences.

class Cache[source]#

Bases: object

Intermediate representations of sentences.

abstract repeat(n: int) → Cache[source]#

Repeat the representations by n times.

Parameters: n (int) – The number of repetitions.
Returns: The repeated cache.
Return type: Cache

abstract property embed_dim: int#: Return the size of embedding dimension.

abstract encode(sentences: list[str]) → Cache[source]#

Encode the given sentences into their intermediate representations.

Parameters: sentences (list[str]) – Input sentences.
Returns: Intermediate representations.
Return type: MetricCacheable.Cache

abstract out_proj(hypotheses_ir: Cache, references_ir: Cache, sources_ir: Cache | None = None) → Tensor[source]#

Forward the output projection layer.

Parameters:

hypotheses_ir (Cache) – N intermediate representations of hypotheses.
references_ir (Cache) – N intermediate representations of references.
sources_ir (Cache, optional) – N intermediate representations of sources.

Returns:

N scores.

Return type:

Tensor

pairwise_scores(hypotheses: list[str], references: list[str], source: str | None = None) → Tensor[source]#

Calculate the pairwise scores.

Parameters:

hypotheses (list[str]) – Hypotheses.
references (list[str]) – References.
source (str, optional) – A source.

Returns:

Score matrix of shape (H, R), where H is the number of hypotheses and R is the number of references.

Return type:

Tensor

pairwise_scores_from_ir(hypotheses_ir: Cache, references_ir: Cache, source_ir: Cache | None = None) → Tensor[source]#

Calculate the pairwise scores from the intermediate representations.

Parameters:

hypotheses_ir (Cache) – Hypotheses.
references_ir (Cache) – References.
source_ir (Cache, optional) – A source.

Returns:

Score matrix of shape (H, R), where H is the number of hypotheses and R is the number of references.

Return type:

Tensor

score(hypothesis: str, reference: str, source: str | None = None) → float[source]#

Calculate the score of the given hypothesis.

Parameters:

hypothesis (str) – A hypothesis.
reference (str) – A reference.
source (str, optional) – A source.

Returns:

The score of the given hypothesis.

Return type:

float

scores(hypotheses: list[str], references: list[str], sources: list[str] | None = None) → Tensor[source]#

Calculate the scores of the given hypotheses.

Parameters:

hypotheses (list[str]) – N hypotheses.
references (list[str]) – N references.
sources (list[str], optional) – N sources.

Returns:

The N scores of the given hypotheses.

Return type:

Tensor

scores_from_ir(hypotheses_ir: Cache, references_ir: Cache, sources_ir: Cache | None = None) → Tensor[source]#

Calculate the scores of the given hypotheses from the intermediate representations.

Parameters:

hypotheses_ir (Cache) – N hypotheses.
references_ir (Cache) – N references.
sources_ir (Cache, optional) – N sources.

Returns:

The N scores of the given hypotheses.

Return type:

Tensor

class mbrs.metrics.MetricChrF(cfg: Config)[source]#

Bases: MetricAggregatable

ChrF metric class.

class AggregatedReference(ngrams: list[Counter])[source]#

Bases: object

Aggregated reference representation.

ngrams (list[Counter]): Bags of n-grams for each order.

ngrams: list[Counter]#

class Config(char_order: int = 6, word_order: int = 0, beta: int = 2, lowercase: bool = False, whitespace: bool = False, eps_smoothing: bool = False, num_workers: int = 8, fastchrf: bool = False)[source]#

Bases: Config

ChrF metric configuration.

char_order (int): Character n-gram order.
word_order (int): Word n-gram order. If equal to 2, the metric is referred to as chrF++.
beta (int): Determine the importance of recall w.r.t precision.
lowercase (bool): Enable case-insensitivity.
whitespace (bool): If True, include whitespaces when extracting character n-grams.
eps_smoothing (bool): If True, applies epsilon smoothing similar to reference chrF++.py, NLTK and Moses implementations.
Otherwise, it takes into account effective match order similar to sacreBLEU < 2.0.0.
num_workers (int): Number of workers for multiprocessing.
fastchrf (bool): Use the rust implementation of chrF.

beta: int = 2#

char_order: int = 6#

eps_smoothing: bool = False#

fastchrf: bool = False#

lowercase: bool = False#

num_workers: int = 8#

whitespace: bool = False#

word_order: int = 0#

cfg: Config#

corpus_score(hypotheses: list[str], references_lists: list[list[str]], sources: list[str] | None = None) → float[source]#

Calculate the corpus-level score.

Parameters:

hypotheses (list[str]) – Hypotheses.
references_lists (list[list[str]]) – Lists of references.
sources (list[str], optional) – Sources.

Returns:

The corpus score.

Return type:

float

expected_scores_reference_aggregation(hypotheses: list[str], references: list[str], source: str | None = None, reference_lprobs: Tensor | None = None) → Tensor[source]#

Calculate the expected scores for each hypothesis.

Parameters:

hypotheses (list[str]) – Hypotheses.
references (list[str]) – References.
source (str, optional) – A source.
reference_lprobs (Tensor, optional) – Log-probabilities for each reference sample. The shape must be (len(references),). See https://arxiv.org/abs/2311.05263.

Returns:

The expected scores for each hypothesis.

Return type:

Tensor

pairwise_scores(hypotheses: list[str], references: list[str], *_, **__) → Tensor[source]#

Calculate the pairwise scores.

Parameters:

hypotheses (list[str]) – Hypotheses.
references (list[str]) – References.

Returns:

Score matrix of shape (H, R), where H is the number of hypotheses and R is the number of references.

Return type:

Tensor

score(hypothesis: str, reference: str, *_, **__) → float[source]#

Calculate the score of the given hypothesis.

Parameters:

hypothesis (str) – Hypothesis.
reference (str) – Reference.

Returns:

The score of the given hypothesis.

Return type:

float

scores(hypotheses: list[str], references: list[str], *_, **__) → Tensor[source]#

Calculate the scores of the given hypotheses.

Parameters:

hypotheses (list[str]) – N hypotheses.
references (list[str]) – N references.

Returns:

The N scores of the given hypotheses.

Return type:

Tensor

class mbrs.metrics.MetricMetricX(cfg: Config)[source]#

Bases: Metric

MetricX metric class.

References:

- MetricX-23: https://aclanthology.org/2023.wmt-1.63
- MetricX-24: https://aclanthology.org/2024.wmt-1.35

Available checkpoints:

google/metricx-24-hybrid-xxl-v2p6
google/metricx-24-hybrid-xl-v2p6
google/metricx-24-hybrid-large-v2p6
google/metricx-23-xxl-v2p0
google/metricx-23-xl-v2p0
google/metricx-23-large-v2p0
google/metricx-23-qe-xxl-v2p0
google/metricx-23-qe-xl-v2p0
google/metricx-23-qe-large-v2p0

class Config(model: str = 'google/metricx-24-hybrid-xxl-v2p6', batch_size: int = 8, fp16: bool = False, bf16: bool = False, cpu: bool = False)[source]#

Bases: Config

MetricX metric configuration.

model (str): Model name or path.
batch_size (int): Batch size.
fp16 (bool): Use float16 for the forward computation.
bf16 (bool): Use bfloat16 for the forward computation.
cpu (bool): Use CPU for the forward computation.

batch_size: int = 8#

bf16: bool = False#

cpu: bool = False#

fp16: bool = False#

model: str = 'google/metricx-24-hybrid-xxl-v2p6'#

HIGHER_IS_BETTER: bool = False#

class InputPrefix(hypothesis: 'str', reference: 'str', source: 'str')[source]#

Bases: object

hypothesis: str#

reference: str#

source: str#

METRICX23_QE_MODELS = {'google/metricx-23-qe-large-v2p0', 'google/metricx-23-qe-xl-v2p0', 'google/metricx-23-qe-xxl-v2p0'}#

METRICX_INPUT_LENGTH_MAP = {MetricXVersion.metricx_23: 1024, MetricXVersion.metricx_24: 1536}#

METRICX_INPUT_PREFIX_MAP = {MetricXVersion.metricx_23: MetricMetricX.InputPrefix(hypothesis='candidate: ', reference=' reference: ', source=' source: '), MetricXVersion.metricx_24: MetricMetricX.InputPrefix(hypothesis=' candidate: ', reference=' reference: ', source='source: ')}#

METRICX_VERSION_MAP = {'google/metricx-23-large-v2p0': MetricXVersion.metricx_23, 'google/metricx-23-qe-large-v2p0': MetricXVersion.metricx_23, 'google/metricx-23-qe-xl-v2p0': MetricXVersion.metricx_23, 'google/metricx-23-qe-xxl-v2p0': MetricXVersion.metricx_23, 'google/metricx-23-xl-v2p0': MetricXVersion.metricx_23, 'google/metricx-23-xxl-v2p0': MetricXVersion.metricx_23, 'google/metricx-24-hybrid-large-v2p6': MetricXVersion.metricx_24, 'google/metricx-24-hybrid-xl-v2p6': MetricXVersion.metricx_24, 'google/metricx-24-hybrid-xxl-v2p6': MetricXVersion.metricx_24}#

class MetricXVersion(value)[source]#

Bases: str, Enum

An enumeration.

metricx_23 = 'metricx_23'#

metricx_24 = 'metricx_24'#

corpus_score(hypotheses: list[str], references_lists: list[list[str]] | None = None, sources: list[str] | None = None) → float[source]#

Calculate the corpus-level score.

Parameters:

hypotheses (list[str]) – Hypotheses.
references_lists (list[list[str]], optional) – Lists of references.
sources (list[str], optional) – Sources.

Returns:

The corpus score.

Return type:

float

property device: device#: Returns the device of the model.

pairwise_scores(hypotheses: list[str], references: list[str], source: str | None = None) → Tensor[source]#

Calculate the pairwise scores.

Parameters:

hypotheses (list[str]) – Hypotheses.
references (list[str]) – References.
source (str, optional) – A source.

Returns:

Score matrix of shape (H, R), where H is the number of hypotheses and R is the number of references.

Return type:

Tensor

score(hypothesis: str, reference: str | None = None, source: str | None = None) → float[source]#

Calculate the score of the given hypothesis.

Parameters:

hypothesis (str) – A hypothesis.
reference (str, optional) – A reference.
source (str, optional) – A source.

Returns:

The score of the given hypothesis.

Return type:

float

scorer: MT5ForRegression#

scores(hypotheses: list[str], references: list[str] | None = None, sources: list[str] | None = None) → Tensor[source]#

Calculate the scores of the given hypotheses.

Parameters:

hypotheses (list[str]) – N hypotheses.
references (list[str], optional) – N references.
sources (list[str], optional) – N sources.

Returns:

The N scores of the given hypotheses.

Return type:

Tensor

class mbrs.metrics.MetricReferenceless(cfg: Config)[source]#

Bases: MetricBase

Base class for reference-less metrics like quality estimation.

corpus_score(hypotheses: list[str], sources: list[str]) → float[source]#

Calculate the corpus-level score.

Parameters:

hypotheses (list[str]) – Hypotheses.
sources (list[str]) – Sources.

Returns:

The corpus score.

Return type:

float

abstract score(hypothesis: str, source: str) → float[source]#

Calculate the score of the given hypothesis.

Parameters:

hypothesis (str) – A hypothesis.
source (str) – A source.

Returns:

The score of the given hypothesis.

Return type:

float

scores(hypotheses: list[str], sources: list[str]) → Tensor[source]#

Calculate the scores of hypotheses.

Parameters:

hypotheses (list[str]) – N hypotheses.
sources (list[str]) – N sources.

Returns:

The scores of hypotheses.

Return type:

Tensor

class mbrs.metrics.MetricTER(cfg: Config)[source]#

Bases: Metric

TER metric class.

class Config(normalized: bool = False, no_punct: bool = False, asian_support: bool = False, case_sensitive: bool = False, num_workers: int = 8)[source]#

Bases: Config

TER metric configuration.

normalized (bool): Enable character normalization. By default, normalizes a couple of things such as newlines being stripped, retrieving XML encoded characters, and fixing tokenization for punctuation. When ‘asian_support’ is enabled, also normalizes specific Asian (CJK) character sequences, i.e. splits them down to the character level.
no_punct (bool): Remove punctuation. Can be used in conjunction with ‘asian_support’ to also remove typical punctuation markers in Asian languages (CJK).
asian_support (bool): Enable special treatment of Asian characters. This option only has an effect when ‘normalized’ and/or ‘no_punct’ is enabled. If ‘normalized’ is also enabled, then Asian (CJK) characters are split down to the character level. If ‘no_punct’ is enabled alongside ‘asian_support’, specific unicode ranges for CJK and full-width punctuations are also removed.
case_sensitive (bool): If True, does not lowercase sentences.
num_workers (int): Number of workers for multiprocessing.

asian_support: bool = False#

case_sensitive: bool = False#

no_punct: bool = False#

normalized: bool = False#

num_workers: int = 8#

HIGHER_IS_BETTER: bool = False#

cfg: Config#

corpus_score(hypotheses: list[str], references_lists: list[list[str]], sources: list[str] | None = None) → float[source]#

Calculate the corpus-level score.

Parameters:

hypotheses (list[str]) – Hypotheses.
references_lists (list[list[str]]) – Lists of references.
sources (list[str], optional) – Sources.

Returns:

The corpus score.

Return type:

float

pairwise_scores(hypotheses: list[str], references: list[str], *_, **__) → Tensor[source]#

Calculate the pairwise scores.

Parameters:

hypotheses (list[str]) – Hypotheses.
references (list[str]) – References.

Returns:

Score matrix of shape (H, R), where H is the number of hypotheses and R is the number of references.

Return type:

Tensor

score(hypothesis: str, reference: str, *_, **__) → float[source]#

Calculate the score of the given hypothesis.

Parameters:

hypothesis (str) – Hypothesis.
reference (str) – Reference.

Returns:

The score of the given hypothesis.

Return type:

float

scores(hypotheses: list[str], references: list[str], *_, **__) → Tensor[source]#

Calculate the scores of the given hypotheses.

Parameters:

hypotheses (list[str]) – N hypotheses.
references (list[str]) – N references.

Returns:

The N scores of the given hypotheses.

Return type:

Tensor

class mbrs.metrics.MetricXCOMET(cfg: Config)[source]#

Bases: Metric

XCOMET metric class.

Both XCOMET (Guerreiro et al., 2024) and XCOMET-lite (Larionov et al., 2024) are supported.

Supported models:

Unbabel/XCOMET-XL
Unbabel/XCOMET-XXL
myyycroft/XCOMET-lite

class Config(model: str = 'Unbabel/XCOMET-XL', batch_size: int = 8, fp16: bool = False, bf16: bool = False, cpu: bool = False)[source]#

Bases: Config

XCOMET metric configuration.

model (str): Model name or path.
batch_size (int): Batch size.
fp16 (bool): Use float16 for the forward computation.
bf16 (bool): Use bfloat16 for the forward computation.
cpu (bool): Use CPU for the forward computation.

batch_size: int = 8#

bf16: bool = False#

cpu: bool = False#

fp16: bool = False#

model: str = 'Unbabel/XCOMET-XL'#

corpus_score(hypotheses: list[str], references_lists: list[list[str]] | None = None, sources: list[str] | None = None) → float[source]#

Calculate the corpus-level score.

Parameters:

hypotheses (list[str]) – Hypotheses.
references_lists (list[list[str]], optional) – Lists of references.
sources (list[str], optional) – Sources.

Returns:

The corpus score.

Return type:

float

property device: device#

Returns the device of the model.

pairwise_scores(hypotheses: list[str], references: list[str], source: str | None = None) → Tensor[source]#

Calculate the pairwise scores.

Parameters:

hypotheses (list[str]) – Hypotheses.
references (list[str]) – References.
source (str, optional) – A source.

Returns:

Score matrix of shape (H, R), where H is the number of hypotheses and R is the number of references.

Return type:

Tensor

score(hypothesis: str, reference: str | None = None, source: str | None = None) → float[source]#

Calculate the score of the given hypothesis.

Parameters:

hypothesis (str) – A hypothesis.
reference (str, optional) – A reference.
source (str, optional) – A source.

Returns:

The score of the given hypothesis.

Return type:

float

scorer: XCOMETMetric#

scores(hypotheses: list[str], references: list[str] | None = None, sources: list[str] | None = None) → Tensor[source]#

Calculate the scores of the given hypotheses.

Parameters:

hypotheses (list[str]) – N hypotheses.
references (list[str], optional) – N references.
sources (list[str], optional) – N sources.

Returns:

The N scores of the given hypotheses.

Return type:

Tensor

mbrs.metrics.get_metric(name: str) → type[T]#

Get a class type.

Parameters: name (str) – A registered name.
Returns: Class type.
Return type: type[T]

mbrs.metrics.register(name: str) → Callable[[type[T]], type[T]]#

Register a type as the given name.

Parameters: name (str) – The name of a type.
Returns: Register decorator function.
Return type: Callable[[type[T]], type[T]]
Raises: ValueError – The type is already registered.

mbrs.metrics package

Contents

mbrs.metrics package#

Submodules#

Module contents#