Running from command line
We created three separate Python scripts corresponding to the core modules of GWAStic. These scripts allow advanced users to set parameters and run the tool directly from the command line, while the existing GUI remains available for those who prefer a more visual, interactive experience. We believe this approach strikes the right balance, allowing a broader range of GWAS researchers to use the software in a way that best suits their workflow.
Usage:
1. Follow the Anaconda and GWAStic installation instructions and activate your gwastic environment.
2. Save one of the Python script examples below on your local machine (we will use vcf_to_bed.py).
3. Adjust the parameters and make sure all file paths are set correctly.
4. Navigate to the directory containing your Python script in the command line.
5. Run the script with the command:
python vcf_to_bed.py
VCF to BED conversion
from gwastic_desktop.gwas_pipeline import GWAS
# Initialize the GWAS class
gwas = GWAS()
# Set the parameters manually
vcf_file = "test/example.vcf.gz"
id_file = None # If no ID file, keep it None; otherwise, provide the file path
file_out = "output"
maf = "0.05"
geno = "0.1"
# Call the vcf_to_bed method from the GWAS class
log = gwas.vcf_to_bed(vcf_file, id_file, file_out, maf, geno)
# Print or log the output
print(log)
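After the conversion, the PLINK binary files should appear next to the script under the file_out prefix. The short sketch below checks for them; it assumes vcf_to_bed writes the standard PLINK trio (.bed, .bim, .fam) using the file_out prefix set above, which you should verify against the log output.

import os

# Assumption: vcf_to_bed writes the standard PLINK binary trio using the file_out prefix ("output")
expected = ["output" + ext for ext in (".bed", ".bim", ".fam")]
missing = [name for name in expected if not os.path.isfile(name)]
if missing:
    print(f"Missing output files: {missing}")
else:
    print("Conversion finished; all PLINK files are present.")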
GWAS analysis
import os
from gwastic_desktop.gwas_pipeline import GWAS
from gwastic_desktop.helpers import HELPERS
from pysnptools.snpreader import Bed, Pheno
import pysnptools.util as pstutil
def run_gwas():
    """Starts the GWAS pipeline."""
    # Initialize the helper and GWAS class
    helper = HELPERS()
    gwas = GWAS()

    # Manually set the parameters here
    train_size_set = 0.2  # Adjust as needed (0.2 gives 80% training, i.e., 1 - 0.2)
    estimators = 100  # Number of estimators for AI models
    model_nr = 1  # Model number
    snp_limit = 100  # Limit for the number of SNPs in the final output
    nr_jobs = -1  # Number of parallel jobs (-1 uses all available cores)
    gb_goal = 1  # Memory usage goal in GB
    max_dep_set = 6  # Maximum depth for tree-based methods
    algorithm = 'FaST-LMM'  # Choose the algorithm to run, e.g. 'FaST-LMM', 'Random Forest (AI)', 'XGBoost (AI)', 'Ridge Regression'
    aggregation_method = "mean"  # Aggregation method for AI models

    # Manually set the paths for the required files
    bed_path = "/test/example.bed"
    pheno_path = "/test/pheno.csv"
    cov_path = None  # Optional: if you have a covariate file, provide the path here

    try:
        print('Validating files...')
        check_input_data = gwas.validate_gwas_input_files(bed_path, pheno_path)
        # Replace chromosome names; they need to be numeric
        chrom_mapping = helper.replace_with_integers(bed_path.replace('.bed', '.bim'))

        # If the input data is valid
        if check_input_data[0]:
            bed = Bed(str(bed_path), count_A1=False, chrom_map=chrom_mapping)
            pheno = Pheno(str(pheno_path))
            if cov_path:
                cov = Pheno(str(cov_path))
            else:
                cov = None

            # Replace the original bed with one that has the same iids as the pheno
            bed, pheno = pstutil.intersect_apply([bed, pheno])
            bed_fixed = gwas.filter_out_missing(bed)
            print(f"Dataset after intersection: SNPs: {bed.sid_count}, Pheno IDs: {pheno.iid_count}")

            # Start the GWAS analysis
            print('Starting Analysis...')
            if algorithm == 'FaST-LMM' or algorithm == 'Linear regression':
                gwas_df, df_plot = gwas.run_gwas_lmm(bed_fixed, pheno, chrom_mapping, print, "gwas_result", algorithm, bed_path, cov, gb_goal)
            elif algorithm == 'Random Forest (AI)':
                gwas_df, df_plot = gwas.run_gwas_rf(bed_fixed, pheno, bed_path, train_size_set, estimators, "gwas_result", chrom_mapping, print, model_nr, nr_jobs, aggregation_method)
            elif algorithm == 'XGBoost (AI)':
                gwas_df, df_plot = gwas.run_gwas_xg(bed_fixed, pheno, bed_path, train_size_set, estimators, "gwas_result", chrom_mapping, print, model_nr, max_dep_set, nr_jobs, aggregation_method)
            elif algorithm == 'Ridge Regression':
                gwas_df, df_plot = gwas.run_gwas_ridge(bed_fixed, pheno, bed_path, train_size_set, 1.0, "gwas_result", chrom_mapping, print, model_nr, aggregation_method)

            if gwas_df is not None:
                print('GWAS Analysis done.')
                # Plot the results (Manhattan and QQ plot)
                gwas.plot_gwas(df_plot, snp_limit, algorithm, "manhattan_plot.png", "qq_plot.png", chrom_mapping)
                print('Done!')
                # Save the results
                callback_save_results(os.getcwd(), gwas_df)
            else:
                print('Error: GWAS Analysis could not be started.')
        else:
            print(check_input_data[1])
    except TypeError as e:
        print(f'Error: {str(e)}')
        print('Please select a phenotype and genotype file.')


def callback_save_results(results_directory, gwas_df):
    """Save the results inside the folder, including tables as CSV and plots as PNG."""
    # Set result file names
    gwas_result_name = "gwas_results.csv"
    gwas_result_name_top = "gwas_results_top10000.csv"
    manhattan_plot_name = "manhattan_plot.png"
    qq_plot_name = "qq_plot.png"

    # Save the GWAS results to CSV
    results_path = os.path.join(results_directory, gwas_result_name)
    gwas_df.to_csv(results_path, index=False)
    print(f"Results saved to {results_path}")

    # Save the top 10,000 results
    results_top_path = os.path.join(results_directory, gwas_result_name_top)
    gwas_df.head(10000).to_csv(results_top_path, index=False)
    print(f"Top 10,000 results saved to {results_top_path}")

    # The plots have already been created and saved by the `gwas.plot_gwas` call above
    print(f"Results saved in: {results_directory}")


# Execute the function
if __name__ == "__main__":
    run_gwas()
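Once the script finishes, the result tables and plots are written to the current working directory. Below is a minimal sketch for inspecting the saved table with pandas; the column names depend on the chosen algorithm, so the snippet only prints whatever columns are present (the file name gwas_results.csv comes from callback_save_results above).

import pandas as pd

# Load the table written by callback_save_results in the current working directory
results = pd.read_csv("gwas_results.csv")
print(results.columns.tolist())  # column names vary with the chosen algorithm
print(results.head(20))          # first 20 rows of the result table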
Genomic Prediction
import os
from gwastic_desktop.gwas_pipeline import GWAS # Import necessary GWAS class
from gwastic_desktop.helpers import HELPERS # Import helpers class
from gwastic_desktop.gp_pipeline import GenomicPrediction
from pysnptools.snpreader import Bed, Pheno
import pysnptools.util as pstutil # Utility library for SNP data intersection
def run_genomic_prediction():
    """Starts the Genomic Prediction (GP) pipeline."""
    # Initialize the helper, the GWAS class (used for validation and filtering) and the GP class
    helper = HELPERS()
    gwas = GWAS()
    genomic_predict_class = GenomicPrediction()

    # Manually set the parameters here (previously from the GUI)
    algorithm = 'GP_LMM'  # Change as needed to 'Random Forest (AI)', 'XGBoost (AI)' or 'Ridge Regression'
    test_size = 0.2  # Example: 80% training, 20% testing
    estimators = 100  # Number of estimators for AI models
    max_dep_set = 6  # Maximum depth for tree-based methods
    model_nr = 5  # Model number
    nr_jobs = -1  # Number of parallel jobs (-1 uses all available cores)

    # Manually set the paths for the required files
    bed_path = "/test/example.bed"
    pheno_path = "/test/pheno.csv"
    cov_path = None  # Optional: if you have a covariate file, provide the path here

    try:
        # Log the progress
        print('Reading and validating files...')
        # Validate input files
        check_input_data = gwas.validate_gwas_input_files(bed_path, pheno_path)
        # Replace chromosome names; they need to be numeric
        chrom_mapping = helper.replace_with_integers(bed_path.replace('.bed', '.bim'))

        # If the input data is valid
        if check_input_data[0]:
            bed = Bed(str(bed_path), count_A1=False, chrom_map=chrom_mapping)
            pheno = Pheno(str(pheno_path))

            # Intersect Bed and Pheno data to ensure the same iids
            bed, pheno = pstutil.intersect_apply([bed, pheno])
            bed_fixed = gwas.filter_out_missing(bed)

            # Log dataset details
            print(f"Dataset after intersection: SNPs: {bed.sid_count}, Pheno IDs: {pheno.iid_count}")

            # Start Genomic Prediction analysis
            print('Starting Genomic Prediction analysis, this might take a while...')
            if algorithm == 'GP_LMM':
                gp_df = genomic_predict_class.run_lmm_gp(
                    bed_fixed, pheno, "genomic_prediction_results.csv", model_nr, print, bed_path, chrom_mapping)
            elif algorithm == 'Random Forest (AI)':
                gp_df = genomic_predict_class.run_gp_rf(
                    bed_fixed, pheno, bed_path, test_size, estimators, "genomic_prediction_results.csv", chrom_mapping, print, model_nr, nr_jobs)
            elif algorithm == 'XGBoost (AI)':
                gp_df = genomic_predict_class.run_gp_xg(
                    bed_fixed, pheno, bed_path, test_size, estimators, "genomic_prediction_results.csv", chrom_mapping, print, model_nr, max_dep_set, nr_jobs)
            elif algorithm == 'Ridge Regression':
                gp_df = genomic_predict_class.run_gp_ridge(
                    bed_fixed, pheno, bed_path, test_size, 1.0, "genomic_prediction_results.csv", chrom_mapping, print, model_nr)

            # After analysis
            if gp_df is not None:
                print('Genomic Prediction done.')
                genomic_predict_class.plot_gp(gp_df, "Bland_Altman_plot.png", algorithm)
                genomic_predict_class.plot_gp_scatter(gp_df, "GP_scatter_plot.png", algorithm)
                # Call the save results function
                save_results(os.getcwd(), gp_df)
            else:
                print('Error: Genomic Prediction analysis could not be started.')
        else:
            print(check_input_data[1])
    except TypeError as e:
        print(f"Error: {str(e)}")
        print("Please select a valid phenotype and genotype file.")


def save_results(results_directory, gp_df):
    """Save the results inside the folder, including tables as CSV and plots as PNG."""
    # File names
    genomic_predict_name = "genomic_prediction_results.csv"
    manhattan_plot_name = "manhattan_plot.png"
    qq_plot_name = "qq_plot.png"
    gp_plot_name = "Bland_Altman_plot.png"
    gp_plot_name_scatter = "GP_scatter_plot.png"
    pheno_stats_name = "pheno_statistics.pdf"
    geno_stats_name = "geno_statistics.pdf"

    # Save the results to CSV
    results_path = os.path.join(results_directory, genomic_predict_name)
    gp_df.to_csv(results_path, index=False)
    print(f"Results saved to {results_path}")

    # The plots have already been saved by the `plot_gp` and `plot_gp_scatter` calls above
    print(f"Results saved in: {results_directory}")


# Execute the function
if __name__ == "__main__":
    run_genomic_prediction()
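As with the GWAS script, the prediction table and plots end up in the current working directory. The sketch below gives a quick summary of the saved predictions without assuming specific column names (the file name genomic_prediction_results.csv comes from save_results above).

import pandas as pd

# Load the table written by save_results in the current working directory
predictions = pd.read_csv("genomic_prediction_results.csv")
print(predictions.columns.tolist())  # column names depend on the chosen GP model
print(predictions.describe())        # summary statistics for the numeric columns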