This example is taken from the ProBound web server.

This example produces a single CTCF binding model by training on SMiLE-seq data.

import torch

import pyprobound
import pyprobound.plotting

Data specification

alphabet = pyprobound.alphabets.DNA()

dataframe = pyprobound.get_dataframe(
    "http://pbdemo.x3dna.org/files/example_data/"
    "singleTF/countTable.0.CTCF_r3.tsv.gz"
)  # SMiLE-seq count table generated from Isakova et al. (2017)
dataframe.head()

	1	2
0
AAAAAAAGCCCGGAAATAGGCAACTTGTAG	0	1
AAAAAAAGGATGTTCCTAGCAACTTATAAA	1	0
AAAAAACAACGATAACCAACTGCTGCCGGA	0	1
AAAAAACACATGTATGAGTTTTTGATGGAG	1	0
AAAAAACCCTCCTTGGTGTCGGACGGCTAT	0	1

count_table = pyprobound.CountTable(
    dataframe,
    alphabet,
    left_flank="ACACTCTTTCCCTACACGACGCTCTTCCGATCTTGACGTC",
    right_flank="GACGTCAGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG",
    left_flank_length=5,
    right_flank_length=5,
)

Model specification

PSAMs

nonspecific = pyprobound.layers.NonSpecific(alphabet=alphabet, name="NS")
psam_1 = pyprobound.layers.PSAM(
    kernel_size=12,
    alphabet=alphabet,
    max_kernel_size=18,
    increment_flank=True,
    shift_footprint_heuristic=True,
    increment_footprint=True,
    increment_flank_with_footprint=True,
)
psam_2 = pyprobound.layers.PSAM(
    kernel_size=12,
    alphabet=alphabet,
    max_kernel_size=18,
    increment_flank=True,
    shift_footprint_heuristic=True,
    increment_footprint=True,
    increment_flank_with_footprint=True,
)
psams = [psam_1, psam_2]

Modes

modes = [
    pyprobound.Mode.from_nonspecific(nonspecific, count_table),
    pyprobound.Mode.from_psam(psam_1, count_table),
    pyprobound.Mode.from_psam(psam_2, count_table),
]

Rounds

round_0 = pyprobound.rounds.InitialRound()
round_1 = pyprobound.rounds.BoundUnsaturatedRound.from_binding(modes, round_0)

Experiment

experiment = pyprobound.Experiment(
    [round_0, round_1],
    name="CTCF",
    counts_per_round=count_table.counts_per_round,
)

Model

model = pyprobound.MultiExperimentLoss([experiment], pseudocount=20)

Fitting

optimizer = pyprobound.Optimizer(
    model,
    [count_table],
    greedy_threshold=2e-4,
    device="cpu",
    checkpoint="CTCF.pt",
    output="CTCF.txt",
)

optimizer.train_sequential()

tensor(0.6677)

optimizer.reload()

{'time': 'Tue Apr 23 19:22:06 2024',
 'version': '1.3.1',
 'flank_lengths': ((6, 6),)}

Loss

with torch.inference_mode():
    loss, reg = model([count_table])
    print(loss, reg, loss + reg)

tensor(0.6426) tensor(0.0251) tensor(0.6677)

Logo

for psam in psams:
    pyprobound.plotting.logo(psam)
    pyprobound.plotting.logo(psam, reverse=True)

8-mer enrichment

pyprobound.plotting.kmer_enrichment(experiment, count_table, kmer_length=8)

../_images/635d69abbb0dec552e00d3144e0c234ed751e829bf6731d67e409fe97b5d1019.png

Probe enrichment

pyprobound.plotting.probe_enrichment(experiment, count_table)

../_images/ba9c3ebf427373952ef71f08b1af9a82780b2f4f6df9a1433b1f798755e0cda3.png

Mode contribution

pyprobound.plotting.contribution(round_1, count_table)

../_images/bca8ae0c1a5c78769a99c3b589492b5f34d6bf64df946712f9f6d9b53810af15.png