This example is taken from the ProBound web server, and corresponds to Figure 5 in the original Nature Biotech publication.

This example produces a model of GR binding that additionally learns the binding models of cofactors such as AP1. It does so by training on single-end ChIP-seq data, in which reads were mapped and extended by 200bp.

import torch

import pyprobound
import pyprobound.plotting

Data specification

alphabet = pyprobound.alphabets.DNA()
dataframe = pyprobound.get_dataframe(
    "http://pbdemo.x3dna.org/files/example_data/"
    "ChIP-single/countTable.0.IMR90_GR_chip-seq_rep1.tsv.gz"
)  # IMR90 ChIP-seq count table generated from Starick et al. (2015)
dataframe.head()
1 2
0
AAAAAAAAAAAAAAAAAATGTTGCTTAACTGGTTTGGTTGTTTCAGCTGGCTACTACCAAGAGAAGTAAACAGGACCTTATCATTTCAGTTAAGTCTGGCTTGAGCATTATTTTCAAGTAAATATTTATAATTTCAATGCTATTCCATTGCTACCACTGCTGTCTTTGGATTGGACCGACTGAAACATTGTCTTGGATTA 0 1
AAAAAAAAAAAAAAGGCCAGCCACCGCGCAGGGAGCCCAGCCCGTGCAGCCCGGAGTCCAGCAGCGACTGGCCCAGAGCAGGGTCCGCGCGCTCGGCCGGCCCGCAGGGAGGAGGGGGCGCGGCTGGGTCGGGCGTGCAGCGGCAGCAAGGAAGGCGGCCTGGGGTTCGCGCTTGGGGCTTCTGCTTTTTCACCATTGCA 1 0
AAAAAAAAAAAAATAACAAGTAGGGACAGGGATTCCTGGGATGGTGCTCATTATGGGTGTCAGGCTGAGTAGAGCTGGCACAGGCCTTGGTTTGTAAACACAGGGCAGAACGAGCATTACCTAAGAGCGCTTTGCTCCTGCACATCCCAAAGAACCAGGCAGTCACTACAAGTGGAAGCTCAAAGAACATGCACTCAAGT 0 1
AAAAAAAAAAAAATGTGTGTAACAAAATATGCACTATAGAGGATCAAACTGTTTTACAAAATCTTCAAAAACCTAAATGTTTTCTTAGCAAATGATGGCATGTCCTTCAAACACTAGAACACTATATTGTTTATTTTGCTTTCAGGCTGATTATAAGGTAAATTCAACTCAGAGAGACTTGATCTTCTTCCTAATTTACC 1 0
AAAAAAAAAAAACAGAGAAAATACCAGGGCTGAAAGGTACAGCATCTCCAAATGTTGCAACTTCATTAGGCAAAGTCTTAAAAAAAAGCTTGTTGACAGGTGGCACCCAGGAAAAATAAAATAAAATGCCCCTTGGTTGGCATCCTTCCCCTCACAGGGCTCTGAAGCTCTTCCTGAACATAAAGCCAAAGTTAACACTG 0 1
count_table = pyprobound.CountTable(dataframe, alphabet)

Model specification

PSAMs

nonspecific = pyprobound.layers.NonSpecific(alphabet=alphabet, name="NS")

psams = [
    pyprobound.layers.PSAM(
        kernel_size=15,
        alphabet=alphabet,
        seed=["AG*ACA**-------"],
        seed_scale=6,
        symmetry=[1, 2, 3, 4, 5, 6, 7, 8, -7, -6, -5, -4, -3, -2, -1],
        name="GR",
    )
] + [
    pyprobound.layers.PSAM(
        kernel_size=10,
        alphabet=alphabet,
        max_kernel_size=18,
        shift_footprint_heuristic=True,
        increment_footprint=True,
    )
    for _ in range(3)
]

Modes

modes = [pyprobound.Mode.from_nonspecific(nonspecific, count_table)] + [
    pyprobound.Mode.from_psam(psam, count_table) for psam in psams
]

Rounds

round_0 = pyprobound.rounds.InitialRound()
round_1 = pyprobound.rounds.BoundUnsaturatedRound.from_binding(modes, round_0)

Experiments

experiment = pyprobound.Experiment(
    [round_0, round_1],
    name="GR",
    counts_per_round=count_table.counts_per_round,
)

Model

model = pyprobound.MultiExperimentLoss([experiment], pseudocount=20)

Fitting

optimizer = pyprobound.Optimizer(
    model,
    [count_table],
    greedy_threshold=2e-4,
    device="cpu",
    checkpoint="GR.pt",
    output="GR.txt",
)
optimizer.train_sequential()
tensor(0.6463)
optimizer.reload()
{'time': 'Tue Apr 23 18:29:42 2024',
 'version': '1.3.1',
 'flank_lengths': ((0, 0),)}

Loss

with torch.inference_mode():
    loss, reg = model([count_table])
    print(loss, reg, loss + reg)
tensor(0.6199) tensor(0.0264) tensor(0.6463)

Position bias

for mode in modes[1:]:
    pyprobound.plotting.posbias(mode)
../_images/fb747b3ce97ea882ddc138ad98372bba75ed4f0928c55136570522fe7e2e2f74.png ../_images/84e4cc99c80276bb248337197bb64709def38053eef3c13041c3b2bb8ef97139.png ../_images/f3a00278aee2c5ce9b17af83b60068fbda7ccff3429a8ebcb644c4f91972cbfb.png ../_images/a0de6f3cbe4ebec1ce25dd21dcda3ef43a50f7a2ec9fbab27b583dc589cc8ec8.png

Probe enrichment

pyprobound.plotting.probe_enrichment(experiment, count_table)
../_images/93f6c007bc8e02dd814eb1296d721b786c132710d7c704d1bf86716d37b6bbc7.png

Mode contribution

pyprobound.plotting.contribution(round_1, count_table)
../_images/7f69d222449b7f285905545b25a764504e10ffa85e6f6f176b33f91bbaed0f1a.png