import numpy as np
import pandas as pd
import torch
from torchmetrics.functional import accuracy, auroc, matthews_corrcoef
class BaseBaseline:
    """Parent of all baseline models.

    Subclasses override :meth:`fit` and :meth:`predict_pair`; the remaining
    methods are built on top of those two hooks.
    """

    def __init__(self, prob: bool = False, **kwargs):
        """Store the baseline configuration.

        Args:
            prob: Stored as ``self.prob``. This base class never reads it;
                presumably subclasses use it to switch between probability and
                hard-label output -- confirm against the subclasses.
            **kwargs: Accepted and ignored by this base class.
        """
        self.prob = prob

    def fit(self, train: pd.DataFrame):
        """Fit the model to the training dataframe.

        Args:
            train: Has to have 'Drug_ID', 'Target_ID' and 'Y' columns.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError()

    def predict_pair(self, prot_id: str, drug_id: str) -> float:
        """Predict the outcome for a pair of a protein and a drug.

        Args:
            prot_id: Value taken from the 'Target_ID' column.
            drug_id: Value taken from the 'Drug_ID' column.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError()

    def test_metrics(self, test: pd.DataFrame) -> dict:
        """Calculate the metrics for the test dataframe.

        Args:
            test: Frame with 'Drug_ID', 'Target_ID' and 'Y' columns.

        Returns:
            Dict with the keys ``acc``, ``auc`` and ``mcc``, each holding the
            result of the corresponding torchmetrics functional cast to float.
        """
        pred = self.predict(test)
        y_hat = torch.tensor(pred["pred"].values)
        y = torch.tensor(pred["Y"].values)
        return dict(
            acc=accuracy(y_hat, y).float(),
            auc=auroc(y_hat, y).float(),
            mcc=matthews_corrcoef(y_hat, y, num_classes=2).float(),
        )

    def predict(self, test: pd.DataFrame) -> pd.DataFrame:
        """Apply prediction to the whole test dataframe.

        Args:
            test: Frame with 'Target_ID' and 'Drug_ID' columns. It is not
                modified.

        Returns:
            A copy of ``test`` with an added 'pred' column holding the result
            of :meth:`predict_pair` for every row.
        """
        # Work on a copy: callers often pass a filtered slice of a larger frame,
        # and writing a column into such a slice mutates caller data and
        # triggers pandas' SettingWithCopyWarning.
        out = test.copy()
        # Zipping the two ID columns (instead of a row-wise ``apply``) also works
        # for an empty frame and avoids building a Series per row.
        out["pred"] = [
            self.predict_pair(prot_id, drug_id)
            for prot_id, drug_id in zip(out["Target_ID"], out["Drug_ID"])
        ]
        return out

    def assess_dataset(self, filename: str, train_frac: float = 0.8, n_runs: int = 10) -> dict:
        """Assess the performance of the model on a dataset.

        Reads a tab-separated file, fits on the rows whose 'split' column is
        "train", evaluates on the rows whose 'split' column is "val" and prints
        a one-line summary.

        Args:
            filename: Path of the tab-separated dataset file.
            train_frac: Currently unused; kept for interface compatibility.
            n_runs: Currently unused; kept for interface compatibility.

        Returns:
            The metrics dict produced by :meth:`test_metrics` on the "val" rows.
        """
        dataset = pd.read_csv(filename, sep="\t")
        train = dataset[dataset["split"] == "train"]
        val = dataset[dataset["split"] == "val"]
        self.fit(train)
        metrics = self.test_metrics(val)
        print(f"Results\tAcc : {metrics['acc']:.3}\tAUROC: {metrics['auc']:.3}\tMCC: {metrics['mcc']:.3}")
        return metrics