# Source code for kinactive.config

"""
Configuration dataclasses for the database, matrix and io.
"""
from dataclasses import dataclass, field
from pathlib import Path

from lXtractor.core.exceptions import MissingData

# Name of the protein kinase (PK) domain; the config dataclasses in this
# module use it as the default ``pk_map_name``.
PK_NAME = "PK"
# DFG label -> integer code.
DFG_MAP = {
    "in": 0,
    "out": 1,
    "other": 2,
}
# Integer code -> DFG label (the inverse of ``DFG_MAP``).
DFG_MAP_REV = dict(zip(DFG_MAP.values(), DFG_MAP.keys()))


@dataclass
class DBConfig:
    """
    Database config.

    Default parameters were used to create lXt-PK data collection.
    To reproduce locally, you may change the paths (``*_dir*``) and adjust
    the number of cpus (``*_cpus``).

    :raises MissingData: On initialization, if the :attr:`profile` path
        does not exist.
    """

    #: progress bar output.
    verbose: bool = True
    #: database dump path.
    target_dir: Path = Path("db")
    #: raw PDB structures.
    pdb_dir: Path = Path("pdb") / "structures"
    #: info on PDB structures.
    pdb_dir_info: Path = Path("pdb") / "info"
    #: raw UniProt sequences.
    seq_dir: Path = Path("uniprot") / "fasta"

    #: max trials for fetching an entry from external resources.
    max_fetch_trials: int = 2
    #: #cpus for ``ChainIO`` (10-20 usually works fine).
    io_cpus: int = 1
    #: #cpus for ``ChainInitializer`` (10-20 usually works fine).
    init_cpus: int = 1
    #: #cpus for pairwise sequence alignments. Increase to max number possible.
    init_map_numbering_cpus: int = 1

    #: A path to the PK profile (supplied with the package)
    profile: Path = Path(__file__).parent / "resources" / "Pkinase.hmm"
    #: the domain name to use for extraction.
    pk_map_name: str = PK_NAME
    #: a minimum BitScore to qualify for hit.
    pk_min_score: float = 50
    #: min domain size for canonical sequences.
    pk_min_seq_domain_size: int = 150
    #: min domain size for structure sequences.
    pk_min_str_domain_size: int = 100
    #: min coverage of the hmm nodes.
    pk_min_cov_hmm: float = 0.7
    #: min coverage of the sequence.
    pk_min_cov_seq: float = 0.7
    #: min matching residues' fraction between structure and canonical sequences.
    pk_min_str_seq_match: float = 0.9

    #: minimum sequence size to filter raw sequences from UniProt.
    min_seq_size: int = 150
    #: maximum sequence size to filter raw sequences from UniProt.
    max_seq_size: int = 3000

    #: PDB files format.
    pdb_fmt: str = "cif"
    #: The number of threads to use when fetching data from the PDB.
    pdb_num_fetch_threads: int = 10
    #: The minimum structure size (in residues) to filter raw structures.
    pdb_str_min_size: int = 100

    #: The chunk size to split UniProt ids into when fetching the data from UniProt.
    uniprot_chunk_size: int = 100
    #: The number of threads to use when fetching the data from UniProt.
    uniprot_num_fetch_threads: int = 10

    def __post_init__(self) -> None:
        # Fail early: a config pointing to a non-existent profile is rejected
        # at construction time.
        if not self.profile.exists():
            raise MissingData(f"Missing PK profile under {self.profile} path")
@dataclass
class MatrixConfig:
    """
    The superposition-based matrix configuration.

    Groups the settings for superposing structures (the number of HMM nodes
    and the backbone atoms to use), the DFG-Asp/Phe positions and atoms used
    for the RMSD computation, and the output/parallelization options.
    """

    # NOTE(review): the original docstring ended mid-sentence ("This matrix is
    # used to compute"); the summary above is restricted to what the field
    # descriptions below state -- confirm the intended wording.

    #: Path to dump the results.
    dir: Path = Path("clustering")
    #: The number of the most covered HMM nodes to use for superposing.
    n_super_pos: int = 30
    #: The PK domain name. Should be the same as used in :class:`kinactive.db.DB`.
    pk_map_name: str = PK_NAME
    #: The number of cpus to use for parallel computation. Adjust carefully.
    n_proc: int | None = None
    #: The chunk size for distributing data between processes.
    chunksize: int = 5000
    #: DFG-Asp/Phe positions.
    df_pos: tuple[int, int] = (141, 142)
    #: Backbone atom names used for superposing.
    bb_atoms: tuple[str, ...] = ("CA",)
    #: DFG-Phe atom names used for RMSD computation.
    phe_atoms: tuple[str, ...] = ("CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ")
    #: DFG-Asp atom names used for RMSD computation.
    asp_atoms: tuple[str, ...] = ("CA", "CB", "CG", "OD1", "OD2")
@dataclass
class _DumpNames:
    """
    Keywords, file names and directory names gathered in one place.

    Every attribute is an annotated dataclass field, so all of them appear in
    ``__init__``, ``repr`` and comparisons.
    """

    cls_keyword: str = "classifier"
    reg_keyword: str = "regressor"
    model_filename: str = "model.bin"
    features_filename: str = "features.txt"
    targets_filename: str = "targets.txt"
    params_filename: str = "params.json"
    in_model_dirname: str = "in"
    out_model_dirname: str = "out"
    other_model_dirname: str = "other"
    meta_model_dirname: str = "meta"
    positions_ca: str = "positions_CA.txt"
    distances: str = "distances.csv"
    # The attributes below used to lack annotations, which made them plain
    # class attributes that ``@dataclass`` silently skipped. They are the
    # trailing attributes, so annotating them keeps the positional order of
    # all earlier ``__init__`` parameters.
    summary_parent_seq: str = "initial_seq_summary.csv"
    summary_parent_str: str = "initial_str_summary.csv"
    summary_child_seq: str = "domain_seq_summary.csv"
    summary_child_str: str = "domain_str_summary.csv"
    # NOTE(review): the prefixes differ ("defaults_" vs "default_"). Values are
    # kept verbatim because existing files may rely on them -- confirm.
    canonical_seq_vs: str = "defaults_can_seq_vs.csv"
    structure_seq_vs: str = "defaults_str_seq_vs.csv"
    ligand_vs: str = "default_lig_vs.csv"
    structure_vs: str = "default_str_vs.csv"

    @property
    def summary_file_names(self) -> tuple[str, str, str, str]:
        """
        :return: The four summary file names: parent seq, parent str,
            child seq, child str.
        """
        return (
            self.summary_parent_seq,
            self.summary_parent_str,
            self.summary_child_seq,
            self.summary_child_str,
        )


@dataclass
class _ModelPaths:
    """
    Paths to the model directories under ``resources/models`` next to this
    module.
    """

    base: Path = Path(__file__).parent / "resources" / "models"
    # NOTE(review): the two defaults below are computed once, at class
    # definition, from the default ``base``; passing another ``base`` to the
    # constructor does not change them.
    kinactive_classifier: Path = base / "kinactive_classifier"
    dfg_classifier: Path = base / "DFG_classifier"


@dataclass
class _ColNames:
    """
    Column names gathered in one place, with grouped accessors for the
    DFG-related columns.
    """

    dfg: str = "DFG"
    dfg_manual: str = "DFG_manual"
    # Previously un-annotated and therefore not a dataclass field. It sits in
    # the middle of the class, so it is keyword-only to avoid shifting the
    # positional ``__init__`` parameters that follow it.
    dfg_pred: str = field(default="DFG_pred", kw_only=True)
    dfg_cls: str = "DFG_cls"
    dfg_cls_pred: str = "DFG_cls_pred"

    is_dfg_in: str = "is_DFG_in"
    is_dfg_out: str = "is_DFG_out"
    is_dfg_other: str = "is_DFG_other"

    dfg_in_proba: str = "in_proba"
    dfg_out_proba: str = "out_proba"
    dfg_other_proba: str = "other_proba"

    # NOTE(review): "in_meta_proba" ends with "proba" while the other two end
    # with "prob". Values are kept verbatim because existing data may use
    # these exact names -- confirm whether the mismatch is intended.
    dfg_in_meta_prob: str = "in_meta_proba"
    dfg_out_meta_prob: str = "out_meta_prob"
    dfg_other_meta_prob: str = "other_meta_prob"

    rmsd_ca: str = "RMSD_CA"
    rmsd_df: str = "RMSD_DF"
    id_fix: str = "ID_fix"
    id_mob: str = "ID_mob"

    @property
    def is_dfg_cols(self) -> tuple[str, str, str]:
        """
        :return: The ``is_dfg_*`` column names ordered as (in, out, other).
        """
        return self.is_dfg_in, self.is_dfg_out, self.is_dfg_other

    @property
    def dfg_proba_cols(self) -> tuple[str, str, str]:
        """
        :return: The ``dfg_*_proba`` column names ordered as (in, out, other).
        """
        return self.dfg_in_proba, self.dfg_out_proba, self.dfg_other_proba

    @property
    def dfg_meta_proba_cols(self) -> tuple[str, str, str]:
        """
        :return: The ``dfg_*_meta_prob`` column names ordered as
            (in, out, other).
        """
        return (
            self.dfg_in_meta_prob,
            self.dfg_out_meta_prob,
            self.dfg_other_meta_prob,
        )

    @property
    def dfg_cols(self) -> list[str]:
        """
        :return: A new list with the DFG-related column names;
            ``dfg_manual`` is not included.
        """
        return [
            self.dfg,
            self.dfg_pred,
            self.dfg_cls,
            self.dfg_cls_pred,
            *self.is_dfg_cols,
            *self.dfg_proba_cols,
            *self.dfg_meta_proba_cols,
        ]


# Module-level instances built with the default values.
DumpNames = _DumpNames()
ColNames = _ColNames()
ModelPaths = _ModelPaths()
DefaultMatrixConfig = MatrixConfig()


if __name__ == "__main__":
    # Running this module as a script raises immediately.
    raise RuntimeError