This example is taken from the ProBound web server and corresponds to Figure 5 in the original Nature Biotechnology publication.
This example produces a model of GR binding that additionally learns the binding models of cofactors such as AP1. It does so by training on single-end ChIP-seq data in which reads were mapped and extended by 200 bp.
import torch
import pyprobound
import pyprobound.plotting
Data specification
# DNA alphabet shared by the count table and the binding layers.
alphabet = pyprobound.alphabets.DNA()

# IMR90 ChIP-seq count table generated from Starick et al. (2015).
# The URL is written as two adjacent literals purely to keep lines short.
count_table_url = (
    "http://pbdemo.x3dna.org/files/example_data/"
    "ChIP-single/countTable.0.IMR90_GR_chip-seq_rep1.tsv.gz"
)
dataframe = pyprobound.get_dataframe(count_table_url)

# Preview the first rows; as the last expression of a notebook cell the
# result is rendered as a table.
dataframe.head()
| 0 | 1 | 2 |
|---|---|---|
| AAAAAAAAAAAAAAAAAATGTTGCTTAACTGGTTTGGTTGTTTCAGCTGGCTACTACCAAGAGAAGTAAACAGGACCTTATCATTTCAGTTAAGTCTGGCTTGAGCATTATTTTCAAGTAAATATTTATAATTTCAATGCTATTCCATTGCTACCACTGCTGTCTTTGGATTGGACCGACTGAAACATTGTCTTGGATTA | 0 | 1 |
| AAAAAAAAAAAAAAGGCCAGCCACCGCGCAGGGAGCCCAGCCCGTGCAGCCCGGAGTCCAGCAGCGACTGGCCCAGAGCAGGGTCCGCGCGCTCGGCCGGCCCGCAGGGAGGAGGGGGCGCGGCTGGGTCGGGCGTGCAGCGGCAGCAAGGAAGGCGGCCTGGGGTTCGCGCTTGGGGCTTCTGCTTTTTCACCATTGCA | 1 | 0 |
| AAAAAAAAAAAAATAACAAGTAGGGACAGGGATTCCTGGGATGGTGCTCATTATGGGTGTCAGGCTGAGTAGAGCTGGCACAGGCCTTGGTTTGTAAACACAGGGCAGAACGAGCATTACCTAAGAGCGCTTTGCTCCTGCACATCCCAAAGAACCAGGCAGTCACTACAAGTGGAAGCTCAAAGAACATGCACTCAAGT | 0 | 1 |
| AAAAAAAAAAAAATGTGTGTAACAAAATATGCACTATAGAGGATCAAACTGTTTTACAAAATCTTCAAAAACCTAAATGTTTTCTTAGCAAATGATGGCATGTCCTTCAAACACTAGAACACTATATTGTTTATTTTGCTTTCAGGCTGATTATAAGGTAAATTCAACTCAGAGAGACTTGATCTTCTTCCTAATTTACC | 1 | 0 |
| AAAAAAAAAAAACAGAGAAAATACCAGGGCTGAAAGGTACAGCATCTCCAAATGTTGCAACTTCATTAGGCAAAGTCTTAAAAAAAAGCTTGTTGACAGGTGGCACCCAGGAAAAATAAAATAAAATGCCCCTTGGTTGGCATCCTTCCCCTCACAGGGCTCTGAAGCTCTTCCTGAACATAAAGCCAAAGTTAACACTG | 0 | 1 |
# Wrap the downloaded dataframe, together with the DNA alphabet, in the
# CountTable object that the modes, experiment and optimizer consume.
count_table = pyprobound.CountTable(
    dataframe,
    alphabet,
)
Model specification
PSAMs
# Non-specific binding layer, labelled "NS".
nonspecific = pyprobound.layers.NonSpecific(alphabet=alphabet, name="NS")

# Seeded 15-position PSAM for GR. The symmetry vector mirrors positions
# 1-7 around the central position 8 with negated indices
# (presumably a reverse-complement constraint -- confirm against the
# PSAM documentation).
gr_psam = pyprobound.layers.PSAM(
    kernel_size=15,
    alphabet=alphabet,
    seed=["AG*ACA**-------"],
    seed_scale=6,
    symmetry=[1, 2, 3, 4, 5, 6, 7, 8, -7, -6, -5, -4, -3, -2, -1],
    name="GR",
)

# Three unseeded, unnamed PSAMs for the cofactor models mentioned in the
# introduction. They start at kernel size 10 with a maximum of 18 and have
# the footprint heuristics switched on.
cofactor_psams = [
    pyprobound.layers.PSAM(
        kernel_size=10,
        alphabet=alphabet,
        max_kernel_size=18,
        shift_footprint_heuristic=True,
        increment_footprint=True,
    )
    for _ in range(3)
]

# GR first, followed by the cofactor PSAMs.
psams = [gr_psam, *cofactor_psams]
Modes
# One binding mode per layer: the non-specific mode comes first, then one
# mode for each PSAM in the order they appear in `psams`.
modes = [pyprobound.Mode.from_nonspecific(nonspecific, count_table)]
modes.extend(pyprobound.Mode.from_psam(psam, count_table) for psam in psams)
Rounds
# Starting round of the experiment.
round_0 = pyprobound.rounds.InitialRound()

# Second round, derived from round_0 through all of the binding modes.
round_1 = pyprobound.rounds.BoundUnsaturatedRound.from_binding(
    modes,
    round_0,
)
Experiments
# Group both rounds into a single experiment named "GR"; the per-round
# counts are taken from the count table.
experiment_rounds = [round_0, round_1]
experiment = pyprobound.Experiment(
    experiment_rounds,
    name="GR",
    counts_per_round=count_table.counts_per_round,
)
Model
# Loss module over the (single) experiment, built with a pseudocount of 20.
experiments = [experiment]
model = pyprobound.MultiExperimentLoss(experiments, pseudocount=20)
Fitting
# Keyword options for the optimizer: run on the CPU and use "GR.pt" /
# "GR.txt" as the checkpoint and output paths.
fit_options = {
    "greedy_threshold": 2e-4,
    "device": "cpu",
    "checkpoint": "GR.pt",
    "output": "GR.txt",
}
optimizer = pyprobound.Optimizer(model, [count_table], **fit_options)

# Kept as the final bare expression so that a notebook displays the value
# returned by training.
optimizer.train_sequential()
tensor(0.6463)
# Reload the optimizer state (presumably from the "GR.pt" checkpoint given
# to the Optimizer -- confirm). In the recorded run the call returned a
# metadata dict with 'time', 'version' and 'flank_lengths' entries.
optimizer.reload()
{'time': 'Tue Apr 23 18:29:42 2024',
'version': '1.3.1',
'flank_lengths': ((0, 0),)}
Loss
# Evaluate the fitted model on the count table without tracking gradients.
# The model returns two tensors, unpacked here as `loss` and `reg`
# (presumably the regularization term -- confirm); both are printed along
# with their sum.
with torch.inference_mode():
    loss, reg = model([count_table])
    print(loss, reg, loss + reg)
tensor(0.6199) tensor(0.0264) tensor(0.6463)













