This example is taken from the ProBound web server, and corresponds to Figure 3a-b in the original Nature Biotech publication.

This example produces models for three Drosophila homeodomain proteins (Ubx, Exd, and Hth) as well as the binding cooperativity of the trimer (while the Ubx:Exd complex has fixed spacing, Hth can have variable spacing relative to Ubx:Exd in the trimer). It does so by training on SELEX-seq data for each factor, as well as assays in which two or more of the factors were present.

import torch

import pyprobound
import pyprobound.plotting

Data specification

alphabet = pyprobound.alphabets.DNA()
url = "http://pbdemo.x3dna.org/files/example_data/hthExdUbx/"
dataframe_UbxExdHth = pyprobound.get_dataframe(
    f"{url}countTable.0.UbxIVa-Hth-Exd.30mer1.tsv.gz"
)
dataframe_UbxExd = pyprobound.get_dataframe(
    f"{url}countTable.1.UbxIVa-Exd.16mer1_rep1.tsv.gz"
)
dataframe_Ubx = pyprobound.get_dataframe(
    f"{url}countTable.2.UbxIVa.16mer1_rep1.tsv.gz"
)
dataframe_Exd = pyprobound.get_dataframe(f"{url}countTable.3.Exd.tsv.gz")
dataframe_Hth = pyprobound.get_dataframe(
    f"{url}countTable.4.Hth.16mer2_rep1.tsv.gz"
)
count_table_UbxExdHth = pyprobound.CountTable(  # varlen=30
    dataframe_UbxExdHth,
    alphabet,
    left_flank="GTTCAGAGTTCTACAGTCCGACGATC",
    right_flank="CCCGGGTCGTATGCCGTCTTCTGCTTG",
    left_flank_length=7,
    right_flank_length=7,
)
count_table_UbxExd = pyprobound.CountTable(  # varlen=16
    dataframe_UbxExd,
    alphabet,
    left_flank="GTTCAGAGTTCTACAGTCCGACGATCTGG",
    right_flank="CCAGCTGTCGTATGCCGTCTTCTGCTTG",
    left_flank_length=7,
    right_flank_length=7,
)
count_table_Ubx = pyprobound.CountTable(  # varlen=16
    dataframe_Ubx,
    alphabet,
    left_flank="GTTCAGAGTTCTACAGTCCGACGATCTGG",
    right_flank="CCAGCTGTCGTATGCCGTCTTCTGCTTG",
    left_flank_length=5,
    right_flank_length=5,
)
count_table_Exd = pyprobound.CountTable(  # varlen=16
    dataframe_Exd,
    alphabet,
    left_flank="TGGGCCTGG",
    right_flank="CCAGG",
    left_flank_length=5,
    right_flank_length=5,
)
count_table_Hth = pyprobound.CountTable(  # varlen=16
    dataframe_Hth,
    alphabet,
    left_flank="GTTCAGAGTTCTACAGTCCGACGATCTGG",
    right_flank="CCACGTCTCGTATGCCGTCTTCTGCTTG",
    left_flank_length=5,
    right_flank_length=5,
)
count_tables = [
    count_table_UbxExdHth,
    count_table_UbxExd,
    count_table_Ubx,
    count_table_Exd,
    count_table_Hth,
]

Model specification

PSAMs

nonspecific = pyprobound.layers.NonSpecific(alphabet=alphabet)
psam_ExdUbx = pyprobound.layers.PSAM(
    kernel_size=13,
    alphabet=alphabet,
    seed=["NATGATTTATGAN"],
    seed_scale=6,
    name="ExdUbx",
)
psam_Exd = pyprobound.layers.PSAM(
    kernel_size=8,
    alphabet=alphabet,
    seed=["NTTGAYRN"],
    seed_scale=6,
    name="Exd",
)
psam_Ubx = pyprobound.layers.PSAM(
    kernel_size=8,
    alphabet=alphabet,
    seed=["NTTATGGN"],
    seed_scale=6,
    name="Ubx",
)
psam_Hth = pyprobound.layers.PSAM(
    kernel_size=8,
    alphabet=alphabet,
    seed=["NNTGAYRN"],
    seed_scale=6,
    name="Hth",
)
spacing_ExdUbx_Hth = pyprobound.Spacing(
    [psam_ExdUbx], [psam_Hth], max_overlap=7
)
psams = [psam_ExdUbx, psam_Exd, psam_Ubx, psam_Hth]

Modes

modes_UbxExdHth = [
    pyprobound.Mode.from_nonspecific(nonspecific, count_table_UbxExdHth)
] + [
    pyprobound.Mode.from_psam(psam, count_table_UbxExdHth)
    for psam in (psam_ExdUbx, psam_Exd, psam_Ubx, psam_Hth)
]
modes_UbxExdHth.append(
    pyprobound.Cooperativity(
        spacing_ExdUbx_Hth, modes_UbxExdHth[1], modes_UbxExdHth[4]
    )
)
modes_UbxExd = [
    pyprobound.Mode.from_nonspecific(nonspecific, count_table_UbxExd)
] + [
    pyprobound.Mode.from_psam(psam, count_table_UbxExd)
    for psam in (psam_ExdUbx, psam_Exd, psam_Ubx)
]
modes_Ubx = [
    pyprobound.Mode.from_nonspecific(nonspecific, count_table_Ubx),
    pyprobound.Mode.from_psam(psam_Ubx, count_table_Ubx),
]
modes_Exd = [
    pyprobound.Mode.from_nonspecific(nonspecific, count_table_Exd),
    pyprobound.Mode.from_psam(psam_Exd, count_table_Exd),
]
modes_Hth = [
    pyprobound.Mode.from_nonspecific(nonspecific, count_table_Hth),
    pyprobound.Mode.from_psam(psam_Hth, count_table_Hth),
]

Rounds

rounds_UbxExdHth = pyprobound.rounds.repeat_round(
    modes_UbxExdHth, 4, pyprobound.rounds.BoundUnsaturatedRound
)
rounds_UbxExd = pyprobound.rounds.repeat_round(
    modes_UbxExd, 4, pyprobound.rounds.BoundUnsaturatedRound
)
rounds_Ubx = pyprobound.rounds.repeat_round(
    modes_Ubx, 3, pyprobound.rounds.BoundUnsaturatedRound
)
rounds_Exd = pyprobound.rounds.repeat_round(
    modes_Exd, 2, pyprobound.rounds.BoundUnsaturatedRound
)
rounds_Hth = pyprobound.rounds.repeat_round(
    modes_Hth, 2, pyprobound.rounds.BoundUnsaturatedRound
)

Experiments

experiments = [
    pyprobound.Experiment(rounds_UbxExdHth, name="UbxExdHth"),
    pyprobound.Experiment(rounds_UbxExd, name="UbxExd"),
    pyprobound.Experiment(rounds_Ubx, name="Ubx"),
    pyprobound.Experiment(rounds_Exd, name="Exd"),
    pyprobound.Experiment(rounds_Hth, name="Hth"),
]

Model

model = pyprobound.MultiExperimentLoss(experiments, pseudocount=0)

Fitting

optimizer = pyprobound.Optimizer(
    model,
    count_tables,
    device="cpu",
    checkpoint="UbxExdHth.pt",
    output="UbxExdHth.txt",
)
optimizer.train_sequential()
tensor(0.7766)
optimizer.reload()
{'time': 'Wed Apr 24 01:08:53 2024',
 'version': '1.3.1',
 'flank_lengths': ((7, 7), (5, 5), (5, 5), (5, 5), (5, 5))}

Loss

with torch.inference_mode():
    loss, reg = model(count_tables)
    print(loss, reg, loss + reg)
tensor(0.7763) tensor(0.0003) tensor(0.7766)

Cooperativity

pyprobound.plotting.spacing(modes_UbxExdHth[-1])
../_images/2625fb92b83ed32858dd7c54a66294815ef9a028f5f29522eade14fabe40c227.png
pyprobound.plotting.cooperativity(modes_UbxExdHth[-1])
../_images/5a0bc42893eafb0951e1b93a22b7aa6e7163bdeb15d2667db45fafb3000c19f9.png

Probe enrichment

for experiment, count_table in zip(experiments, count_tables):
    pyprobound.plotting.probe_enrichment(experiment, count_table)
../_images/946bd35e9d59c1b0d0ed24268d14caba20063e966484258ae946d47f1b7a86a2.png ../_images/47c11794e41b3dabf3f94d26d3bb30584423fc7ab6eb11b15de414046fbb9451.png ../_images/61be031dba6043be6f7548cc52c3f44348b4589fc933caec214c6ab2828cd992.png ../_images/97d6170ab40a43d89f97df35b604c91a0e0258dc47dd89c2aca6a36ff0492be0.png ../_images/22063a3c1371bbbba81fb8fd5802f9884c324e34d8f24a080be382c51fd300f4.png

Mode contribution

for experiment, count_table in zip(experiments, count_tables):
    for rnd in experiment.observed_rounds[1:]:
        pyprobound.plotting.contribution(rnd, count_table)
../_images/c128028e4a79523fc2348634284b43439aed344fe6f7a584a083613b14031f6a.png ../_images/c17983711a2cf58b64c9ba2a624a3b8b73ae5869783e07d24c167116dcd87357.png ../_images/b8254693a7cc54561158687d2ff8a38131bb88ce6517f8d2dfdd02c4c0d20e21.png ../_images/2d6f26679c6e73a34c295f43dc6b3ac868c1de362fbd6d58e66ba07188ed0a06.png ../_images/02bb80bc08bad40f234d0528a2fb92030906f60bebcf5700b62386122d11df34.png ../_images/a66f1847b39fc8140049c686819cd839837238e3c705c658662957f4dcde21f2.png ../_images/b233c5c64389a6834b63153ccd867d0cf1d7e778973d167b372a773fd4fc67e0.png ../_images/c56c6288da8e4dc09e9365ec28b383009155cfbc0e2d659edcf6bc2bdef205a4.png ../_images/e50e5571330f7b3e321399f6eb1e155bf6e93f54deac25b535604fdaabaa043e.png ../_images/0a59b4311760dc3fa7e90386e01b6b157c2c8ebc795ca96cefb1d63ad4f5e0fc.png