kinactive.config module

Configuration dataclasses for the database, matrix and io.

class kinactive.config.DBConfig(verbose: bool = True, target_dir: Path = PosixPath('db'), pdb_dir: Path = PosixPath('pdb/structures'), pdb_dir_info: Path = PosixPath('pdb/info'), seq_dir: Path = PosixPath('uniprot/fasta'), max_fetch_trials: int = 2, io_cpus: int = 1, init_cpus: int = 1, init_map_numbering_cpus: int = 1, init_add_structure_cpus: int = 1, init_tolerate_failures: bool = True, profile: Path = PosixPath('/home/docs/checkouts/readthedocs.org/user_builds/kinactive/checkouts/latest/kinactive/resources/PF00069.hmm'), tk2pk: Path = PosixPath('/home/docs/checkouts/readthedocs.org/user_builds/kinactive/checkouts/latest/kinactive/resources/tk2pk.json'), pk_map_name: str = 'PK', pk_min_score: float = 50, pk_min_seq_domain_size: int = 150, pk_min_str_domain_size: int = 100, pk_min_cov_hmm: float = 0.5, pk_min_cov_seq: float = 0.5, pk_min_str_seq_match: float = 0.8, min_seq_size: int = 150, max_seq_size: int = 5000, pdb_fmt: str = 'mmtf.gz', pdb_num_fetch_threads: int = 10, pdb_str_min_size: int = 100, uniprot_chunk_size: int = 100, uniprot_num_fetch_threads: int = 10)[source]

Bases: object

Database config.

Default parameters were used to create the lXt-PK data collection. To reproduce locally, you may change the paths (*_dir*) and adjust the number of cpus (*_cpus).

__init__(verbose: bool = True, target_dir: Path = PosixPath('db'), pdb_dir: Path = PosixPath('pdb/structures'), pdb_dir_info: Path = PosixPath('pdb/info'), seq_dir: Path = PosixPath('uniprot/fasta'), max_fetch_trials: int = 2, io_cpus: int = 1, init_cpus: int = 1, init_map_numbering_cpus: int = 1, init_add_structure_cpus: int = 1, init_tolerate_failures: bool = True, profile: Path = PosixPath('/home/docs/checkouts/readthedocs.org/user_builds/kinactive/checkouts/latest/kinactive/resources/PF00069.hmm'), tk2pk: Path = PosixPath('/home/docs/checkouts/readthedocs.org/user_builds/kinactive/checkouts/latest/kinactive/resources/tk2pk.json'), pk_map_name: str = 'PK', pk_min_score: float = 50, pk_min_seq_domain_size: int = 150, pk_min_str_domain_size: int = 100, pk_min_cov_hmm: float = 0.5, pk_min_cov_seq: float = 0.5, pk_min_str_seq_match: float = 0.8, min_seq_size: int = 150, max_seq_size: int = 5000, pdb_fmt: str = 'mmtf.gz', pdb_num_fetch_threads: int = 10, pdb_str_min_size: int = 100, uniprot_chunk_size: int = 100, uniprot_num_fetch_threads: int = 10) → None

init_add_structure_cpus: int = 1: #cpus for adding structures to a chain. Valid if init_map_numbering_cpus is > 1.

init_cpus: int = 1: #cpus for ChainInitializer (10-20 usually works fine).

init_map_numbering_cpus: int = 1: #cpus for pairwise sequence alignments. Increase to max number possible.

init_tolerate_failures: bool = True: Tolerate initialization failures. Should be True unless you are testing.

io_cpus: int = 1: #cpus for ChainIO (10-20 usually works fine).

max_fetch_trials: int = 2: max trials for fetching an entry from external resources.

max_seq_size: int = 5000: maximum sequence size to filter raw sequences from UniProt.

min_seq_size: int = 150: minimum sequence size to filter raw sequences from UniProt.

pdb_dir: Path = PosixPath('pdb/structures'): raw PDB structures.

pdb_dir_info: Path = PosixPath('pdb/info'): info on PDB structures.

pdb_fmt: str = 'mmtf.gz': PDB files format.

pdb_num_fetch_threads: int = 10: The number of threads to use when fetching data from the PDB.

pdb_str_min_size: int = 100: The minimum structure size (in residues) to filter raw structures.

pk_map_name: str = 'PK': the domain name to use for extraction.

pk_min_cov_hmm: float = 0.5: min coverage of the hmm nodes.

pk_min_cov_seq: float = 0.5: min coverage of the sequence.

pk_min_score: float = 50: the minimum BitScore for a hit to qualify.

pk_min_seq_domain_size: int = 150: min domain size for canonical sequences.

pk_min_str_domain_size: int = 100: min domain size for structure sequences.

pk_min_str_seq_match: float = 0.8: min fraction of matching residues between structure and canonical sequences.

profile: Path = PosixPath('/home/docs/checkouts/readthedocs.org/user_builds/kinactive/checkouts/latest/kinactive/resources/PF00069.hmm'): A path to the PK profile (supplied with the package)

seq_dir: Path = PosixPath('uniprot/fasta'): raw UniProt sequences.

target_dir: Path = PosixPath('db'): database dump path.

tk2pk: Path = PosixPath('/home/docs/checkouts/readthedocs.org/user_builds/kinactive/checkouts/latest/kinactive/resources/tk2pk.json'): A map between TK and PK profile nodes

uniprot_chunk_size: int = 100: The chunk size to split UniProt ids into when fetching the data from UniProt.

uniprot_num_fetch_threads: int = 10: The number of threads to use when fetching the data from UniProt.

verbose: bool = True: progress bar output.

class kinactive.config.MatrixConfig(dir: Path = PosixPath('clustering'), n_super_pos: int = 30, pk_map_name: str = 'PK', n_proc: int | None = None, chunksize: int = 5000, df_pos: tuple[int, int] = (141, 142), bb_atoms: tuple[str, ...] = ('CA',), phe_atoms: tuple[str, ...] = ('CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ'), asp_atoms: tuple[str, ...] = ('CA', 'CB', 'CG', 'OD1', 'OD2'))[source]

Bases: object

The superposition-based matrix configuration. This matrix is used to compute

__init__(dir: Path = PosixPath('clustering'), n_super_pos: int = 30, pk_map_name: str = 'PK', n_proc: int | None = None, chunksize: int = 5000, df_pos: tuple[int, int] = (141, 142), bb_atoms: tuple[str, ...] = ('CA',), phe_atoms: tuple[str, ...] = ('CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ'), asp_atoms: tuple[str, ...] = ('CA', 'CB', 'CG', 'OD1', 'OD2')) → None

asp_atoms: tuple[str, ...] = ('CA', 'CB', 'CG', 'OD1', 'OD2'): DFG-Asp atom names used for RMSD computation.

bb_atoms: tuple[str, ...] = ('CA',): Backbone atom names used for superposing.

chunksize: int = 5000: The chunk size for distributing data between processes.

df_pos: tuple[int, int] = (141, 142): DFG-Asp/Phe positions.

dir: Path = PosixPath('clustering'): Path to dump the results.

n_proc: int | None = None: The number of cpus to use for parallel computation. Adjust carefully.

n_super_pos: int = 30: The number of the most covered HMM nodes to use for superposing.

phe_atoms: tuple[str, ...] = ('CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ'): DFG-Phe atom names used for RMSD computation.

pk_map_name: str = 'PK': The PK domain name. Should be the same as used in kinactive.db.DB.

kinactive.config.load_data_links(path: Path = PosixPath('/home/docs/checkouts/readthedocs.org/user_builds/kinactive/checkouts/latest/kinactive/resources/data_links.json')) → dict[str, str][source]