Running from command line
We created three separate Python scripts corresponding to the core modules of GWAStic. These scripts allow advanced users to set parameters and run the tool directly from the command line, while the existing GUI remains available for those who prefer a more visual, interactive experience. We believe this approach strikes the right balance, allowing a broader range of GWAS researchers to use the software in a way that best suits their workflow.
Usage:
1. Follow the Anaconda and GWAStic installation instructions and activate your gwastic environment.
2. Save one of the Python script examples below on your local machine (we will use vcf_to_bed.py).
3. Adjust the parameters and make sure all file paths are set correctly.
4. Navigate to the directory containing your Python script in the command line.
5. Run the script with the command:
python vcf_to_bed.py
VCF to BED conversion
from gwastic_desktop.gwas_pipeline import GWAS
# Initialize the GWAS class
gwas = GWAS()
# Set the parameters manually
vcf_file = "test/example.vcf.gz"
id_file = None # If no ID file, keep it None; otherwise, provide the file path
file_out = "output"
maf = "0.05"
geno = "0.1"
# Call the vcf_to_bed method from the GWAS class
log = gwas.vcf_to_bed(vcf_file, id_file, file_out, maf, geno)
# Print or log the output
print(log)
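After the conversion, the PLINK binary files should appear next to the script under the file_out prefix. The short sketch below checks for them; it assumes vcf_to_bed writes the standard PLINK trio (.bed, .bim, .fam) using the file_out prefix set above, which you should verify against the log output.

import os

# Assumption: vcf_to_bed writes the standard PLINK binary trio using the file_out prefix ("output")
expected = ["output" + ext for ext in (".bed", ".bim", ".fam")]
missing = [name for name in expected if not os.path.isfile(name)]
if missing:
    print(f"Missing output files: {missing}")
else:
    print("Conversion finished; all PLINK files are present.")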
GWAS analysis
import os
from gwastic_desktop.gwas_pipeline import GWAS
from gwastic_desktop.helpers import HELPERS
from pysnptools.snpreader import Bed, Pheno
import pysnptools.util as pstutil
def run_gwas():
    """Starts the GWAS pipeline."""
    # Initialize the helper and GWAS class
    helper = HELPERS()
    gwas = GWAS()

    # Manually set the parameters here
    train_size_set = 0.2  # Adjust as needed (0.2 gives 80% training, i.e., 1 - 0.2)
    estimators = 100  # Number of estimators for AI models
    model_nr = 1  # Model number
    snp_limit = 100  # Limit for the number of SNPs in the final output
    nr_jobs = -1  # Number of parallel jobs (-1 uses all available cores)
    gb_goal = 1  # Memory usage goal in GB
    max_dep_set = 6  # Maximum depth for tree-based methods
    algorithm = 'FaST-LMM'  # Choose the algorithm to run, e.g. 'FaST-LMM', 'Random Forest (AI)', 'XGBoost (AI)', 'Ridge Regression'
    aggregation_method = "mean"  # Aggregation method for AI models

    # Manually set the paths for the required files
    bed_path = "/test/example.bed"
    pheno_path = "/test/pheno.csv"
    cov_path = None  # Optional: if you have a covariate file, provide the path here

    try:
        print('Validating files...')
        check_input_data = gwas.validate_gwas_input_files(bed_path, pheno_path)
        # Replace chromosome names; they need to be numeric
        chrom_mapping = helper.replace_with_integers(bed_path.replace('.bed', '.bim'))

        # If the input data is valid
        if check_input_data[0]:
            bed = Bed(str(bed_path), count_A1=False, chrom_map=chrom_mapping)
            pheno = Pheno(str(pheno_path))
            if cov_path:
                cov = Pheno(str(cov_path))
            else:
                cov = None

            # Replace the original bed with one that has the same iids as the pheno
            bed, pheno = pstutil.intersect_apply([bed, pheno])
            bed_fixed = gwas.filter_out_missing(bed)
            print(f"Dataset after intersection: SNPs: {bed.sid_count}, Pheno IDs: {pheno.iid_count}")

            # Start the GWAS analysis
            print('Starting Analysis...')
            if algorithm == 'FaST-LMM' or algorithm == 'Linear regression':
                gwas_df, df_plot = gwas.run_gwas_lmm(bed_fixed, pheno, chrom_mapping, print, "gwas_result", algorithm, bed_path, cov, gb_goal)
            elif algorithm == 'Random Forest (AI)':
                gwas_df, df_plot = gwas.run_gwas_rf(bed_fixed, pheno, bed_path, train_size_set, estimators, "gwas_result", chrom_mapping, print, model_nr, nr_jobs, aggregation_method)
            elif algorithm == 'XGBoost (AI)':
                gwas_df, df_plot = gwas.run_gwas_xg(bed_fixed, pheno, bed_path, train_size_set, estimators, "gwas_result", chrom_mapping, print, model_nr, max_dep_set, nr_jobs, aggregation_method)
            elif algorithm == 'Ridge Regression':
                gwas_df, df_plot = gwas.run_gwas_ridge(bed_fixed, pheno, bed_path, train_size_set, 1.0, "gwas_result", chrom_mapping, print, model_nr, aggregation_method)

            if gwas_df is not None:
                print('GWAS Analysis done.')
                # Plot the results (Manhattan and QQ plot)
                gwas.plot_gwas(df_plot, snp_limit, algorithm, "manhattan_plot.png", "qq_plot.png", chrom_mapping)
                print('Done!')
                # Save the results
                callback_save_results(os.getcwd(), gwas_df)
            else:
                print('Error: GWAS Analysis could not be started.')
        else:
            print(check_input_data[1])
    except TypeError as e:
        print(f'Error: {str(e)}')
        print('Please select a phenotype and genotype file.')


def callback_save_results(results_directory, gwas_df):
    """Save the results inside the folder, including tables as CSV and plots as PNG."""
    # Set result file names
    gwas_result_name = "gwas_results.csv"
    gwas_result_name_top = "gwas_results_top10000.csv"
    manhattan_plot_name = "manhattan_plot.png"
    qq_plot_name = "qq_plot.png"

    # Save the GWAS results to CSV
    results_path = os.path.join(results_directory, gwas_result_name)
    gwas_df.to_csv(results_path, index=False)
    print(f"Results saved to {results_path}")

    # Save the top 10,000 results
    results_top_path = os.path.join(results_directory, gwas_result_name_top)
    gwas_df.head(10000).to_csv(results_top_path, index=False)
    print(f"Top 10,000 results saved to {results_top_path}")

    # The plots have already been created and saved by the `gwas.plot_gwas` call above
    print(f"Results saved in: {results_directory}")


# Execute the function
if __name__ == "__main__":
    run_gwas()
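Once the script finishes, the result tables and plots are written to the current working directory. Below is a minimal sketch for inspecting the saved table with pandas; the column names depend on the chosen algorithm, so the snippet only prints whatever columns are present (the file name gwas_results.csv comes from callback_save_results above).

import pandas as pd

# Load the table written by callback_save_results in the current working directory
results = pd.read_csv("gwas_results.csv")
print(results.columns.tolist())  # column names vary with the chosen algorithm
print(results.head(20))          # first 20 rows of the result table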
Genomic Prediction
import os
from gwastic_desktop.gwas_pipeline import GWAS # Import necessary GWAS class
from gwastic_desktop.helpers import HELPERS # Import helpers class
from gwastic_desktop.gp_pipeline import GenomicPrediction
from pysnptools.snpreader import Bed, Pheno
import pysnptools.util as pstutil # Utility library for SNP data intersection
def run_genomic_prediction():
    """Starts the Genomic Prediction (GP) pipeline."""
    # Initialize the helper, the GWAS class (used for validation and filtering) and the GP class
    helper = HELPERS()
    gwas = GWAS()
    genomic_predict_class = GenomicPrediction()

    # Manually set the parameters here (previously from the GUI)
    algorithm = 'GP_LMM'  # Change as needed to 'Random Forest (AI)', 'XGBoost (AI)' or 'Ridge Regression'
    test_size = 0.2  # Example: 80% training, 20% testing
    estimators = 100  # Number of estimators for AI models
    max_dep_set = 6  # Maximum depth for tree-based methods
    model_nr = 5  # Model number
    nr_jobs = -1  # Number of parallel jobs (-1 uses all available cores)

    # Manually set the paths for the required files
    bed_path = "/test/example.bed"
    pheno_path = "/test/pheno.csv"
    cov_path = None  # Optional: if you have a covariate file, provide the path here

    try:
        # Log the progress
        print('Reading and validating files...')
        # Validate input files
        check_input_data = gwas.validate_gwas_input_files(bed_path, pheno_path)
        # Replace chromosome names; they need to be numeric
        chrom_mapping = helper.replace_with_integers(bed_path.replace('.bed', '.bim'))

        # If the input data is valid
        if check_input_data[0]:
            bed = Bed(str(bed_path), count_A1=False, chrom_map=chrom_mapping)
            pheno = Pheno(str(pheno_path))

            # Intersect Bed and Pheno data to ensure the same iids
            bed, pheno = pstutil.intersect_apply([bed, pheno])
            bed_fixed = gwas.filter_out_missing(bed)

            # Log dataset details
            print(f"Dataset after intersection: SNPs: {bed.sid_count}, Pheno IDs: {pheno.iid_count}")

            # Start Genomic Prediction analysis
            print('Starting Genomic Prediction analysis, this might take a while...')
            if algorithm == 'GP_LMM':
                gp_df = genomic_predict_class.run_lmm_gp(
                    bed_fixed, pheno, "genomic_prediction_results.csv", model_nr, print, bed_path, chrom_mapping)
            elif algorithm == 'Random Forest (AI)':
                gp_df = genomic_predict_class.run_gp_rf(
                    bed_fixed, pheno, bed_path, test_size, estimators, "genomic_prediction_results.csv", chrom_mapping, print, model_nr, nr_jobs)
            elif algorithm == 'XGBoost (AI)':
                gp_df = genomic_predict_class.run_gp_xg(
                    bed_fixed, pheno, bed_path, test_size, estimators, "genomic_prediction_results.csv", chrom_mapping, print, model_nr, max_dep_set, nr_jobs)
            elif algorithm == 'Ridge Regression':
                gp_df = genomic_predict_class.run_gp_ridge(
                    bed_fixed, pheno, bed_path, test_size, 1.0, "genomic_prediction_results.csv", chrom_mapping, print, model_nr)

            # After analysis
            if gp_df is not None:
                print('Genomic Prediction done.')
                genomic_predict_class.plot_gp(gp_df, "Bland_Altman_plot.png", algorithm)
                genomic_predict_class.plot_gp_scatter(gp_df, "GP_scatter_plot.png", algorithm)
                # Call the save results function
                save_results(os.getcwd(), gp_df)
            else:
                print('Error: Genomic Prediction analysis could not be started.')
        else:
            print(check_input_data[1])
    except TypeError as e:
        print(f"Error: {str(e)}")
        print("Please select a valid phenotype and genotype file.")


def save_results(results_directory, gp_df):
    """Save the results inside the folder, including tables as CSV and plots as PNG."""
    # File names
    genomic_predict_name = "genomic_prediction_results.csv"
    manhattan_plot_name = "manhattan_plot.png"
    qq_plot_name = "qq_plot.png"
    gp_plot_name = "Bland_Altman_plot.png"
    gp_plot_name_scatter = "GP_scatter_plot.png"
    pheno_stats_name = "pheno_statistics.pdf"
    geno_stats_name = "geno_statistics.pdf"

    # Save the results to CSV
    results_path = os.path.join(results_directory, genomic_predict_name)
    gp_df.to_csv(results_path, index=False)
    print(f"Results saved to {results_path}")

    # The plots have already been saved by the `plot_gp` and `plot_gp_scatter` calls above
    print(f"Results saved in: {results_directory}")


# Execute the function
if __name__ == "__main__":
    run_genomic_prediction()
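As with the GWAS script, the prediction table and plots end up in the current working directory. The sketch below gives a quick summary of the saved predictions without assuming specific column names (the file name genomic_prediction_results.csv comes from save_results above).

import pandas as pd

# Load the table written by save_results in the current working directory
predictions = pd.read_csv("genomic_prediction_results.csv")
print(predictions.columns.tolist())  # column names depend on the chosen GP model
print(predictions.describe())        # summary statistics for the numeric columns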