kinactive.config module

Configuration dataclasses for the database, matrix and io.

class kinactive.config.DBConfig(verbose: bool = True, target_dir: Path = PosixPath('db'), pdb_dir: Path = PosixPath('pdb/structures'), pdb_dir_info: Path = PosixPath('pdb/info'), seq_dir: Path = PosixPath('uniprot/fasta'), max_fetch_trials: int = 2, io_cpus: int = 1, init_cpus: int = 1, init_map_numbering_cpus: int = 1, init_add_structure_cpus: int = 1, init_tolerate_failures: bool = True, profile: Path = PosixPath('/home/docs/checkouts/readthedocs.org/user_builds/kinactive/checkouts/latest/kinactive/resources/PF00069.hmm'), tk2pk: Path = PosixPath('/home/docs/checkouts/readthedocs.org/user_builds/kinactive/checkouts/latest/kinactive/resources/tk2pk.json'), pk_map_name: str = 'PK', pk_min_score: float = 50, pk_min_seq_domain_size: int = 150, pk_min_str_domain_size: int = 100, pk_min_cov_hmm: float = 0.5, pk_min_cov_seq: float = 0.5, pk_min_str_seq_match: float = 0.8, min_seq_size: int = 150, max_seq_size: int = 5000, pdb_fmt: str = 'mmtf.gz', pdb_num_fetch_threads: int = 10, pdb_str_min_size: int = 100, uniprot_chunk_size: int = 100, uniprot_num_fetch_threads: int = 10)[source]

Bases: object

Database config.

Default parameters were used to create the lXt-PK data collection. To reproduce it locally, you may change the paths (*_dir*) and adjust the number of CPUs (*_cpus).

__init__(verbose: bool = True, target_dir: Path = PosixPath('db'), pdb_dir: Path = PosixPath('pdb/structures'), pdb_dir_info: Path = PosixPath('pdb/info'), seq_dir: Path = PosixPath('uniprot/fasta'), max_fetch_trials: int = 2, io_cpus: int = 1, init_cpus: int = 1, init_map_numbering_cpus: int = 1, init_add_structure_cpus: int = 1, init_tolerate_failures: bool = True, profile: Path = PosixPath('/home/docs/checkouts/readthedocs.org/user_builds/kinactive/checkouts/latest/kinactive/resources/PF00069.hmm'), tk2pk: Path = PosixPath('/home/docs/checkouts/readthedocs.org/user_builds/kinactive/checkouts/latest/kinactive/resources/tk2pk.json'), pk_map_name: str = 'PK', pk_min_score: float = 50, pk_min_seq_domain_size: int = 150, pk_min_str_domain_size: int = 100, pk_min_cov_hmm: float = 0.5, pk_min_cov_seq: float = 0.5, pk_min_str_seq_match: float = 0.8, min_seq_size: int = 150, max_seq_size: int = 5000, pdb_fmt: str = 'mmtf.gz', pdb_num_fetch_threads: int = 10, pdb_str_min_size: int = 100, uniprot_chunk_size: int = 100, uniprot_num_fetch_threads: int = 10) None
init_add_structure_cpus: int = 1

#cpus for adding structures to a chain. Takes effect only if init_map_numbering_cpus > 1.

init_cpus: int = 1

#cpus for ChainInitializer (10-20 usually works fine).

init_map_numbering_cpus: int = 1

#cpus for pairwise sequence alignments. Increase to max number possible.

init_tolerate_failures: bool = True

Tolerate initialization failures. Should be True unless testing.

io_cpus: int = 1

#cpus for ChainIO (10-20 usually works fine).

max_fetch_trials: int = 2

max trials for fetching an entry from external resources.

max_seq_size: int = 5000

maximum sequence size to filter raw sequences from UniProt.

min_seq_size: int = 150

minimum sequence size to filter raw sequences from UniProt.

pdb_dir: Path = PosixPath('pdb/structures')

raw PDB structures.

pdb_dir_info: Path = PosixPath('pdb/info')

info on PDB structures.

pdb_fmt: str = 'mmtf.gz'

PDB files format.

pdb_num_fetch_threads: int = 10

The number of threads to use when fetching data from the PDB.

pdb_str_min_size: int = 100

The minimum structure size (in residues) to filter raw structures.

pk_map_name: str = 'PK'

the domain name to use for extraction.

pk_min_cov_hmm: float = 0.5

min coverage of the hmm nodes.

pk_min_cov_seq: float = 0.5

min coverage of the sequence.

pk_min_score: float = 50

The minimum BitScore for a match to qualify as a hit.

pk_min_seq_domain_size: int = 150

min domain size for canonical sequences.

pk_min_str_domain_size: int = 100

min domain size for structure sequences.

pk_min_str_seq_match: float = 0.8

Minimum fraction of matching residues between structure and canonical sequences.

profile: Path = PosixPath('/home/docs/checkouts/readthedocs.org/user_builds/kinactive/checkouts/latest/kinactive/resources/PF00069.hmm')

A path to the PK profile (supplied with the package)

seq_dir: Path = PosixPath('uniprot/fasta')

raw UniProt sequences.

target_dir: Path = PosixPath('db')

database dump path.

tk2pk: Path = PosixPath('/home/docs/checkouts/readthedocs.org/user_builds/kinactive/checkouts/latest/kinactive/resources/tk2pk.json')

A map between TK and PK profile nodes

uniprot_chunk_size: int = 100

The chunk size to split UniProt ids into when fetching the data from UniProt.

uniprot_num_fetch_threads: int = 10

The number of threads to use when fetching the data from UniProt.

verbose: bool = True

progress bar output.

class kinactive.config.MatrixConfig(dir: Path = PosixPath('clustering'), n_super_pos: int = 30, pk_map_name: str = 'PK', n_proc: int | None = None, chunksize: int = 5000, df_pos: tuple[int, int] = (141, 142), bb_atoms: tuple[str, ...] = ('CA',), phe_atoms: tuple[str, ...] = ('CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ'), asp_atoms: tuple[str, ...] = ('CA', 'CB', 'CG', 'OD1', 'OD2'))[source]

Bases: object

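The superposition-based matrix configuration. Per the field descriptions below, structures are superposed on backbone atoms and RMSD is computed over the DFG-Asp and DFG-Phe atoms.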

__init__(dir: Path = PosixPath('clustering'), n_super_pos: int = 30, pk_map_name: str = 'PK', n_proc: int | None = None, chunksize: int = 5000, df_pos: tuple[int, int] = (141, 142), bb_atoms: tuple[str, ...] = ('CA',), phe_atoms: tuple[str, ...] = ('CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ'), asp_atoms: tuple[str, ...] = ('CA', 'CB', 'CG', 'OD1', 'OD2')) None
asp_atoms: tuple[str, ...] = ('CA', 'CB', 'CG', 'OD1', 'OD2')

DFG-Asp atom names used for RMSD computation.

bb_atoms: tuple[str, ...] = ('CA',)

Backbone atom names used for superposing.

chunksize: int = 5000

The chunk size for distributing data between processes.

df_pos: tuple[int, int] = (141, 142)

DFG-Asp/Phe positions.

dir: Path = PosixPath('clustering')

Path to dump the results.

n_proc: int | None = None

The number of cpus to use for parallel computation. Adjust carefully.

n_super_pos: int = 30

The number of the most covered HMM nodes to use for superposing.

phe_atoms: tuple[str, ...] = ('CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ')

DFG-Phe atom names used for RMSD computation.

pk_map_name: str = 'PK'

The PK domain name. Should be the same as used in kinactive.db.DB.