This example is taken from the ProBound web server.
This example produces a single CTCF binding model by training on SMiLE-seq data.
import torch
import pyprobound
import pyprobound.plotting
Data specification
alphabet = pyprobound.alphabets.DNA()
dataframe = pyprobound.get_dataframe(
"http://pbdemo.x3dna.org/files/example_data/"
"singleTF/countTable.0.CTCF_r3.tsv.gz"
) # SMiLE-seq count table generated from Isakova et al. (2017)
dataframe.head()
| 1 | 2 | |
|---|---|---|
| 0 | ||
| AAAAAAAGCCCGGAAATAGGCAACTTGTAG | 0 | 1 |
| AAAAAAAGGATGTTCCTAGCAACTTATAAA | 1 | 0 |
| AAAAAACAACGATAACCAACTGCTGCCGGA | 0 | 1 |
| AAAAAACACATGTATGAGTTTTTGATGGAG | 1 | 0 |
| AAAAAACCCTCCTTGGTGTCGGACGGCTAT | 0 | 1 |
count_table = pyprobound.CountTable(
dataframe,
alphabet,
left_flank="ACACTCTTTCCCTACACGACGCTCTTCCGATCTTGACGTC",
right_flank="GACGTCAGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG",
left_flank_length=5,
right_flank_length=5,
)
Model specification
PSAMs
nonspecific = pyprobound.layers.NonSpecific(alphabet=alphabet, name="NS")
psam_1 = pyprobound.layers.PSAM(
kernel_size=12,
alphabet=alphabet,
max_kernel_size=18,
increment_flank=True,
shift_footprint_heuristic=True,
increment_footprint=True,
increment_flank_with_footprint=True,
)
psam_2 = pyprobound.layers.PSAM(
kernel_size=12,
alphabet=alphabet,
max_kernel_size=18,
increment_flank=True,
shift_footprint_heuristic=True,
increment_footprint=True,
increment_flank_with_footprint=True,
)
psams = [psam_1, psam_2]
Modes
modes = [
pyprobound.Mode.from_nonspecific(nonspecific, count_table),
pyprobound.Mode.from_psam(psam_1, count_table),
pyprobound.Mode.from_psam(psam_2, count_table),
]
Rounds
round_0 = pyprobound.rounds.InitialRound()
round_1 = pyprobound.rounds.BoundUnsaturatedRound.from_binding(modes, round_0)
Experiment
experiment = pyprobound.Experiment(
[round_0, round_1],
name="CTCF",
counts_per_round=count_table.counts_per_round,
)
Model
model = pyprobound.MultiExperimentLoss([experiment], pseudocount=20)
Fitting
optimizer = pyprobound.Optimizer(
model,
[count_table],
greedy_threshold=2e-4,
device="cpu",
checkpoint="CTCF.pt",
output="CTCF.txt",
)
optimizer.train_sequential()
tensor(0.6677)
optimizer.reload()
{'time': 'Tue Apr 23 19:22:06 2024',
'version': '1.3.1',
'flank_lengths': ((6, 6),)}
Loss
with torch.inference_mode():
loss, reg = model([count_table])
print(loss, reg, loss + reg)
tensor(0.6426) tensor(0.0251) tensor(0.6677)






