This example is taken from the ProBound web server and corresponds to Figure 3a-b of the original Nature Biotechnology publication.
This example produces models for three Drosophila homeodomain proteins (Ubx, Exd, and Hth) as well as the binding cooperativity of the trimer (while the Ubx:Exd complex has fixed spacing, Hth can have variable spacing relative to Ubx:Exd in the trimer). It does so by training on SELEX-seq data for each individual factor, as well as on data from assays in which two or more of the factors were present.
import torch
import pyprobound
import pyprobound.plotting
Data specification
# DNA alphabet object handed to every count table and PSAM in this example.
alphabet = pyprobound.alphabets.DNA()

# Base location of the example count tables.
url = "http://pbdemo.x3dna.org/files/example_data/hthExdUbx/"

# Load one count table per experiment, in the order used throughout the
# example: UbxExdHth, UbxExd, Ubx, Exd, Hth.
(
    dataframe_UbxExdHth,
    dataframe_UbxExd,
    dataframe_Ubx,
    dataframe_Exd,
    dataframe_Hth,
) = (
    pyprobound.get_dataframe(url + table_file)
    for table_file in (
        "countTable.0.UbxIVa-Hth-Exd.30mer1.tsv.gz",
        "countTable.1.UbxIVa-Exd.16mer1_rep1.tsv.gz",
        "countTable.2.UbxIVa.16mer1_rep1.tsv.gz",
        "countTable.3.Exd.tsv.gz",
        "countTable.4.Hth.16mer2_rep1.tsv.gz",
    )
)
# Count table for the assay with all three factors present (varlen=30).
# The flank sequences are the constant regions on either side of the variable
# region; the *_flank_length values are presumably how many flank bases are
# made visible to the model on each side -- confirm against pyprobound docs.
count_table_UbxExdHth = pyprobound.CountTable(
    dataframe_UbxExdHth,
    alphabet,
    left_flank="GTTCAGAGTTCTACAGTCCGACGATC",
    left_flank_length=7,
    right_flank="CCCGGGTCGTATGCCGTCTTCTGCTTG",
    right_flank_length=7,
)
# Count table for the Ubx + Exd assay (varlen=16).
# Flank lengths are 5/5: the checkpoint metadata recorded for this example
# reports flank_lengths of (5, 5) for the second count table, and the Ubx
# table built from the same library flanks also uses 5/5. The previous 7/7
# setting could not reproduce the recorded run.
count_table_UbxExd = pyprobound.CountTable(
    dataframe_UbxExd,
    alphabet,
    left_flank="GTTCAGAGTTCTACAGTCCGACGATCTGG",
    right_flank="CCAGCTGTCGTATGCCGTCTTCTGCTTG",
    left_flank_length=5,
    right_flank_length=5,
)
# Count tables for the three single-factor assays (each varlen=16).
# Every table uses 5 flank positions on each side; only the flank sequences
# differ between libraries.

# Ubx alone.
count_table_Ubx = pyprobound.CountTable(
    dataframe_Ubx,
    alphabet,
    left_flank="GTTCAGAGTTCTACAGTCCGACGATCTGG",
    left_flank_length=5,
    right_flank="CCAGCTGTCGTATGCCGTCTTCTGCTTG",
    right_flank_length=5,
)

# Exd alone (this library has much shorter flank sequences).
count_table_Exd = pyprobound.CountTable(
    dataframe_Exd,
    alphabet,
    left_flank="TGGGCCTGG",
    left_flank_length=5,
    right_flank="CCAGG",
    right_flank_length=5,
)

# Hth alone.
count_table_Hth = pyprobound.CountTable(
    dataframe_Hth,
    alphabet,
    left_flank="GTTCAGAGTTCTACAGTCCGACGATCTGG",
    left_flank_length=5,
    right_flank="CCACGTCTCGTATGCCGTCTTCTGCTTG",
    right_flank_length=5,
)
# All count tables, in the same order as the experiments handed to the loss
# (UbxExdHth, UbxExd, Ubx, Exd, Hth); the list is passed to the optimizer and
# to the model as a unit.
count_tables = [
    count_table_UbxExdHth,  # index 0: Ubx + Exd + Hth
    count_table_UbxExd,  # index 1: Ubx + Exd
    count_table_Ubx,  # index 2: Ubx only
    count_table_Exd,  # index 3: Exd only
    count_table_Hth,  # index 4: Hth only
]
Model specification
PSAMs
# Non-specific binding layer, reused by every experiment's mode list.
nonspecific = pyprobound.layers.NonSpecific(alphabet=alphabet)

# One PSAM per binding model. Each kernel_size equals the length of its seed
# string; the seeds appear to use IUPAC ambiguity codes (N, Y, R).

# Exd:Ubx heterodimer (13-bp footprint).
psam_ExdUbx = pyprobound.layers.PSAM(
    name="ExdUbx",
    alphabet=alphabet,
    kernel_size=13,
    seed=["NATGATTTATGAN"],
    seed_scale=6,
)

# Exd monomer.
psam_Exd = pyprobound.layers.PSAM(
    name="Exd",
    alphabet=alphabet,
    kernel_size=8,
    seed=["NTTGAYRN"],
    seed_scale=6,
)

# Ubx monomer.
psam_Ubx = pyprobound.layers.PSAM(
    name="Ubx",
    alphabet=alphabet,
    kernel_size=8,
    seed=["NTTATGGN"],
    seed_scale=6,
)

# Hth monomer.
psam_Hth = pyprobound.layers.PSAM(
    name="Hth",
    alphabet=alphabet,
    kernel_size=8,
    seed=["NNTGAYRN"],
    seed_scale=6,
)
# Spacing object relating the Exd:Ubx PSAM to the Hth PSAM; it is consumed by
# the Cooperativity term of the three-factor experiment.
spacing_ExdUbx_Hth = pyprobound.Spacing(
    [psam_ExdUbx],
    [psam_Hth],
    max_overlap=7,
)

# Every PSAM defined for this example, gathered in one list.
psams = [
    psam_ExdUbx,
    psam_Exd,
    psam_Ubx,
    psam_Hth,
]
Modes
# Binding modes for the three-factor experiment.
# Resulting order: [0] non-specific, [1] ExdUbx, [2] Exd, [3] Ubx, [4] Hth.
modes_UbxExdHth = [
    pyprobound.Mode.from_nonspecific(nonspecific, count_table_UbxExdHth),
    *(
        pyprobound.Mode.from_psam(psam, count_table_UbxExdHth)
        for psam in (psam_ExdUbx, psam_Exd, psam_Ubx, psam_Hth)
    ),
]

# Add the cooperative term pairing the ExdUbx mode (index 1) with the Hth
# mode (index 4); it becomes the sixth entry of the list.
modes_UbxExdHth += [
    pyprobound.Cooperativity(
        spacing_ExdUbx_Hth,
        modes_UbxExdHth[1],
        modes_UbxExdHth[4],
    )
]
# Binding modes for the Ubx + Exd experiment: non-specific first, followed by
# the ExdUbx, Exd and Ubx PSAM modes (no Hth in this assay).
modes_UbxExd = [
    pyprobound.Mode.from_nonspecific(nonspecific, count_table_UbxExd),
    *(
        pyprobound.Mode.from_psam(psam, count_table_UbxExd)
        for psam in (psam_ExdUbx, psam_Exd, psam_Ubx)
    ),
]
# Binding modes for the single-factor experiments: each experiment gets a
# non-specific mode plus the mode of its own PSAM, built on its own count
# table. Construction order is Ubx, Exd, Hth.
modes_Ubx, modes_Exd, modes_Hth = (
    [
        pyprobound.Mode.from_nonspecific(nonspecific, table),
        pyprobound.Mode.from_psam(factor_psam, table),
    ]
    for factor_psam, table in (
        (psam_Ubx, count_table_Ubx),
        (psam_Exd, count_table_Exd),
        (psam_Hth, count_table_Hth),
    )
)
Rounds
# Build the round sequence of every experiment with BoundUnsaturatedRound.
# The integer is forwarded unchanged as the second argument of repeat_round;
# presumably it is the number of rounds -- confirm against pyprobound docs.
(
    rounds_UbxExdHth,
    rounds_UbxExd,
    rounds_Ubx,
    rounds_Exd,
    rounds_Hth,
) = (
    pyprobound.rounds.repeat_round(
        experiment_modes, count, pyprobound.rounds.BoundUnsaturatedRound
    )
    for experiment_modes, count in (
        (modes_UbxExdHth, 4),
        (modes_UbxExd, 4),
        (modes_Ubx, 3),
        (modes_Exd, 2),
        (modes_Hth, 2),
    )
)
Experiments
# One named Experiment per assay, listed in the same order as count_tables.
experiments = [
    pyprobound.Experiment(experiment_rounds, name=label)
    for experiment_rounds, label in (
        (rounds_UbxExdHth, "UbxExdHth"),
        (rounds_UbxExd, "UbxExd"),
        (rounds_Ubx, "Ubx"),
        (rounds_Exd, "Exd"),
        (rounds_Hth, "Hth"),
    )
]
Model
# Joint loss over all five experiments; no pseudocount is applied.
model = pyprobound.MultiExperimentLoss(
    experiments,
    pseudocount=0,
)
Fitting
# Optimizer tying the joint loss to its count tables. Training runs on the
# CPU; "UbxExdHth.pt" is the checkpoint file and "UbxExdHth.txt" the output
# file given to the optimizer.
optimizer = pyprobound.Optimizer(
    model,
    count_tables,
    device="cpu",
    output="UbxExdHth.txt",
    checkpoint="UbxExdHth.pt",
)

# Fit the model; the value returned by this call is what the notebook
# displays as the cell output.
optimizer.train_sequential()
tensor(0.7766)
# Reload the optimizer's checkpoint; the returned metadata (time, version,
# flank_lengths) is what the notebook displays as the cell output.
optimizer.reload()
{'time': 'Wed Apr 24 01:08:53 2024',
'version': '1.3.1',
'flank_lengths': ((7, 7), (5, 5), (5, 5), (5, 5), (5, 5))}
Loss
# Evaluate the fitted model on all count tables with autograd disabled.
# The model returns two values, unpacked here as the loss and the
# regularization term; their sum is printed alongside them.
with torch.inference_mode():
    loss, reg = model(count_tables)
    print(loss, reg, loss + reg)
tensor(0.7763) tensor(0.0003) tensor(0.7766)
























