This example is taken from the ProBound web server, and corresponds to Figure 6 in the original Nature Biotech publication.

This example produces a model of the sequence-specificity of the tyrosine kinase c-Src by training on data from a Kinase-seq experiment, in which a bacterial display peptide library is exposed to c-Src and anti-phosphotyrosine pulldowns are performed at varying timepoints, which are modeled jointly.

This example uses the Adam optimizer with early stopping. Each minibatch is formed by binning sequences by their level of enrichment and sampling from each bin separately to ensure both strong and weak binders are represented in each optimization step.

import torch
import scipy.stats
import matplotlib.pyplot as plt

import pyprobound
import pyprobound.plotting
import pyprobound.fitting

Data specification

alphabet = pyprobound.alphabets.Protein()

url = "http://pbdemo.x3dna.org/files/example_data/Kinase/"
dataframe = pyprobound.get_dataframe(
    [
        f"{url}countTable.0.200205_Src-Kinase_5m.tsv.gz",
        f"{url}countTable.1.200205_Src-Kinase_20m.tsv.gz",
        f"{url}countTable.2.200205_Src-Kinase_60m.tsv.gz",
    ]
).iloc[:, [0, 1, 3, 5]]
dataframe.head()

# Create validation split for early stopping
val_dataframe, train_dataframe = pyprobound.sample_dataframe(
    dataframe, frac=0.3, random_state=0
)
len(train_dataframe), len(val_dataframe)

(160403, 68744)

# Sort for EvenSampler
for df in (val_dataframe, train_dataframe):
    df["Enrichment"] = (df.iloc[:, -1] + 1) / (df.iloc[:, 0] + 1)
    df.sort_values("Enrichment", inplace=True)
    df.drop(columns="Enrichment", inplace=True)

train_count_table = pyprobound.CountTable(
    train_dataframe,
    alphabet,
    left_flank="TAGTSVAGQSGQ",
    right_flank="GGQSGQSGDYNK",
    left_flank_length=5,
    right_flank_length=5,
    min_variable_length=min(dataframe.index.str.len()),
    max_variable_length=max(dataframe.index.str.len()),
)

val_count_table = pyprobound.CountTable(
    val_dataframe,
    alphabet,
    left_flank="TAGTSVAGQSGQ",
    right_flank="GGQSGQSGDYNK",
    left_flank_length=5,
    right_flank_length=5,
    min_variable_length=min(dataframe.index.str.len()),
    max_variable_length=max(dataframe.index.str.len()),
)

Model specification

PSAMs

nonspecific = pyprobound.layers.NonSpecific(alphabet=alphabet, name="NS")
psam = pyprobound.layers.PSAM(
    kernel_size=7,
    alphabet=alphabet,
    pairwise_distance=6,
    seed=["***Y***"],
    seed_scale=45,
    name="Src",
)

Modes

modes = [
    pyprobound.Mode.from_nonspecific(nonspecific, train_count_table),
    pyprobound.Mode.from_psam(psam, train_count_table),
]

Rounds

round_0 = pyprobound.rounds.InitialRound()
round_1 = pyprobound.rounds.ExponentialRound.from_binding(
    modes,
    round_0,
    delta=-15,
    train_delta=True,
    target_concentration=0.25,
    train_concentration=True,
    name="5m",
)
round_2 = pyprobound.rounds.ExponentialRound.from_round(
    round_1, round_0, target_concentration=1.0, name="20m"
)
round_3 = pyprobound.rounds.ExponentialRound.from_round(
    round_1, round_0, target_concentration=3.0, name="60m"
)

# Set log contributions to be 0 instead of -inf
for ctrb in round_1.aggregate.contributions:
    ctrb.set_contribution()

Experiments

experiment = pyprobound.Experiment(
    [round_0, round_1, round_2, round_3],
    name="Src",
    counts_per_round=train_count_table.counts_per_round,
)

Model

model = pyprobound.MultiExperimentLoss([experiment], pseudocount=0)

Fitting

optimizer = pyprobound.Optimizer(
    model,
    [train_count_table],
    [val_count_table],
    batch_size=2048,
    greedy_threshold=2e-4,
    device="cpu",
    checkpoint="Src_earlystop.pt",
    output="Src_earlystop.txt",
    sampler=pyprobound.EvenSampler,
    optimizer=torch.optim.Adam,
)

optimizer.train_simultaneous()

tensor(1.0766)

optimizer.reload()

{'time': 'Tue Apr 23 19:14:47 2024',
 'version': '1.3.1',
 'flank_lengths': ((5, 5),)}

Loss

with torch.inference_mode():
    train_loss, train_reg = model([train_count_table])
    print(
        f"Train:\tLoss={train_loss:.3f}, Reg={train_reg:.3f}, Sum={train_loss+train_reg:.3f}"
    )

    val_loss, val_reg = model([val_count_table])
    print(
        f"Val:\tLoss={val_loss:.3f}, Reg={val_reg:.3f}, Sum={val_loss+val_reg:.3f}"
    )

Train:	Loss=1.057, Reg=0.001, Sum=1.058

Val:	Loss=1.076, Reg=0.001, Sum=1.077

Logo

pyprobound.plotting.logo(psam)

Model consistency

pyprobound.plotting.keff_consistency(experiment, train_count_table)

../_images/9650b5974183e8854f3718f87f63d00327538ac4b1b1c20d2600e3c0c24f30d1.png

Validation

HPLC

validation_df = pyprobound.get_dataframe(["Src_HPLC.tsv"])
validation_ct = pyprobound.CountTable(validation_df, alphabet)

observed = validation_ct.target[:, 0]
with torch.inference_mode():
    predicted = torch.exp(round_1.log_aggregate(validation_ct.seqs))
spearman_r = scipy.stats.spearmanr(observed.log(), predicted.log()).statistic
pearson_r = scipy.stats.pearsonr(observed.log(), predicted.log()).statistic
plt.figure(figsize=(5, 5))
plt.scatter(
    predicted, observed, label=f"$r_s$={spearman_r:.3f}, $r$={pearson_r:.3f}"
)
plt.xscale("log")
plt.yscale("log")
plt.xlim(min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1]))
plt.ylim(plt.xlim())
plt.legend()
plt.show()

../_images/a6db040e1aafbdffa193d59b287270295f9cad00c5418a60304dfd36cb56b325.png

fit = pyprobound.fitting.LogFit(
    round_1,
    validation_ct,
    prediction=lambda log_aggregate: log_aggregate,
    device="cpu",
    update_construct=True,
    name="Src HPLC",
)
fit.plot(
    xlabel=r"Predicted $k_{\mathrm{eff}}$",
    ylabel=r"$v_0$",
    labels=[
        "High",
        "F8-Y442",
        "F8-K444R",
        "TGM1-Y312",
        "TGM1-R315L",
        "VHL-Y185",
        "VHL-L184R",
        "HRAS-Y64",
        "HRAS-E63K",
        "CDK5-Y15",
        "CDK5-G16R",
    ],
    colors=(
        ["black"]
        + ["blue"] * 2
        + ["brown"] * 2
        + ["purple"] * 2
        + ["green"] * 2
        + ["red"] * 2
    ),
)

../_images/bc116eb0e8344fdf5bf484ffc11df90bff27b89973a54e5e541f016a3434a599.png