Commit 03779420 authored by Michaela Olson's avatar Michaela Olson
Browse files

Upload all MorphEUS scripts

parent 58890591
legacy/R/.Rhistory
*.csv
*.m~
*.xlsx
*.asv
*.fig
classification_and_analysis/trial_results/*
classification_and_analysis/logs/*
version_info.md
\.DS_Store
.DS_Store
*.eps
# morpheus
# MorphEUS
Code for Morphological profiling of tubercle bacilli identifies drug pathways of action
\ No newline at end of file
Contains two main directories:
## segmentation_pipeline
Inside this directory is the code used to transform the .csv files outputted by MicrobeJ into a MATLAB workspace. This code filters out of focus and blurry cells using the transverse profile and calculates the median, Q1, Q3 and IQR for each replicate.
## classification_and_analysis
Inside this directory is the code used to run the classification trials as outlined in figure S3 as well as code to create the consensus KNN (cKNN) and heatmaps to visualize the results from the classification trials.
In addition, code is included to perform the Kruskal–Wallis tests to determine significance of variables based on using single-cell data and not replicate averages as seen in Figure 1. There is also code to create PCAs that were used in the earlier versions of the analysis pipeline (as seen in Figures 2 and S1).
%% PCA Analysis with user prompts for workspace, number of variables, and drug selection
% DESCRIPTION:
% Interactive entry point. Prompts the user to obtain the following
% variables and leaves them in the workspace:
% - numvar
% - chosen_workspaces
% - chosen_drugs
% - drugs_to_apply
%
% Having declared these variables, runs the following scripts in this
% order:
% - load_workspaces
% - PCA_analysis
%
% PREREQUISITES: none
%
% OUTPUT: the four variables listed above; everything else is produced by
% load_workspaces and PCA_analysis (TODO: document their outputs here)
%% Clear workspace before start
% Currently disabled (the clear below is commented out) so that settings
% already present in the workspace are kept.
%~~ Comment out to disable this feature ~~%
%clear
%% Add Paths
% add_all_paths is a separate script that puts the project subfolders on
% the MATLAB path.
add_all_paths
%% Figure output folder
% The interactive prompt is disabled; figures always go to ./figures/.
%figure_folder_path = inputdlg("Folder to save figures in:", "Figure Path", [1 35], "./figures/");
figure_folder_path = "./figures/";
%% Prompt for PCA_analysis settings
% Scripts such as get_existing_feature_set may already have set these two
% flags; respect them as checkbox defaults and only fall back to true when
% they are absent from the workspace.
if exist('do_feature_reduction','var') == 0
    do_feature_reduction = true;
end
if exist('remove_extra_controls','var') == 0
    remove_extra_controls = true;
end
disp('bad treated = IMI, Dau, Nig, Nal');
% One row per checkbox: {setting name, default state}. Keeping each name
% next to its default avoids the two lists drifting out of step.
pca_setting_spec_ = { ...
    'disc',                  true; ...
    'do_feature_reduction',  do_feature_reduction; ...
    'include_DMSO',          false; ...
    'show_clustergram',      false; ...
    'show_pca',              true; ...
    'plot_3',                false; ...
    'plot_3_DMSO',           false; ...
    'plot_before_tvn',       false; ...
    'remove_INH_control',    true; ...
    'remove_extra_controls', remove_extra_controls; ...
    'remove_bad_treated',    true; ...
    'prompt_for_one_dose',   false; ...
    'use_medians_only',      false};
variables_to_create = pca_setting_spec_(:,1)';
default_values = [pca_setting_spec_{:,2}];
% drop the temporary so the workspace holds the same variables as before
clear pca_setting_spec_
target_values = checkboxList("Select PCA_analysis Settings", variables_to_create, default_values);
initialize_variables %Initialize variables_to_create as target_values
% Medians-only mode fixes the variable count up front.
if use_medians_only
    numvar = 25;
end
%% make this global to make it accessible inside ver_figsave()
global chosen_workspaces
%% Select number of variables to use
% With feature reduction on, ask the user how many variables to keep.
% Otherwise, if pca_whitened_table is already in the workspace, use its
% number of numeric columns. If neither applies, numvar is left untouched
% (it may be undefined unless it was set earlier in this script).
if do_feature_reduction
    prompt = {'Enter number of variables:'};
    dlgtitle = 'Variable Number';
    dims = [1 35];
    definput = {'94'};
    numvar_answer = inputdlg(prompt,dlgtitle,dims,definput);
    % inputdlg returns an empty cell when the dialog is cancelled or
    % closed; stop quietly, as the other selection dialogs in this script
    % do, instead of failing on numvar_answer{1}.
    if isempty(numvar_answer)
        return
    end
    % str2double only parses a number; str2num would run the typed text
    % through eval.
    numvar = str2double(numvar_answer{1});
    if ~isfinite(numvar) || numvar < 1 || numvar ~= floor(numvar)
        error('Number of variables must be a positive whole number; got "%s".', numvar_answer{1})
    end
else
    if exist('pca_whitened_table','var')
        % count the numeric columns of the whitened table
        numvar = sum(varfun(@isnumeric,pca_whitened_table,'OutputFormat', 'uniform'));
    end
end
%% Locate the available workspaces, let the user pick, then load the picks
% Fall back to the conventional folder when the caller has not set one.
if exist('workspace_directory', 'var') == 0
    workspace_directory = './workspaces';
    disp("Defaulted workspace_directory to './workspaces' in load_workspaces")
end
workspace_list = get_workspaces(workspace_directory);
if isempty(workspace_list)
    error(strcat("No workspaces found in ", workspace_directory))
end
% Nothing is pre-selected unless an existing feature set (overall_vars)
% names the workspaces it was computed from.
default_selection = [];
if exist('overall_vars', 'var')
    try
        default_selection = find(ismember(workspace_list, overall_vars.WORKSPACE));
    catch
        disp("Note: overall_vars does not have a WORKSPACE field")
        default_selection = [];
    end
end
% Show the selection dialog.
[indexes_chosen, any_chosen_tf] = listdlg('ListString',workspace_list,...
    'Name',"Select workspace",'ListSize',[380 380],...
    'InitialValue', default_selection);
% A cancelled dialog ends the script here (load_workspaces has its own
% guard for this as well).
if ~any_chosen_tf
    return
end
% Translate the chosen indices into workspace names.
chosen_workspaces = workspace_list(indexes_chosen);
%% Load the selected workspaces
load_workspaces
%% From the list of drugs, prompt user to select which drugs to use and apply
% choice_variable is the list offered in both dialogs; it must already be
% in the workspace at this point (presumably set by load_workspaces --
% confirm there).
%
% If we are operating from an existing feature set, start from the drugs
% that were used to calculate that set. The lookup is wrapped in try/catch,
% like the WORKSPACE lookup earlier in this script, so a feature set
% without a DRUGS field no longer aborts the run.
default_selection = [];
if exist('overall_vars', 'var')
    try
        default_selection = find(ismember(choice_variable, overall_vars.DRUGS));
    catch
        disp("Note: overall_vars does not have a DRUGS field")
        default_selection = [];
    end
end
suggestion = {'Mer','Amp','Ctax','INH','EMB','ETA','IMI','Van','Cyc','Del',...
    'Lev','Mox','Clz','MIT','Olf','Kan','Amk','Cam','Cla','Dox','Gent',...
    'Strep','Tet','Lin','Pre','CCCP','Cer','Mon','Nig','Thi','RifT','BDQ',...
    'RIF','THL','water','Untreated'};
% NOTE(review): suggestion is defined just above, so this test is always
% true and the hard-coded list replaces the feature-set default. Comment
% out the suggestion assignment to let the feature-set default win.
if exist('suggestion','var')
    default_selection = find(ismember(choice_variable,suggestion));
end
% For extra_resolution_confusion
if prompt_for_one_dose
    select_one_dose
end
% Prompt user to select a list of drugs
[indexes_chosen, any_chosen_tf] = listdlg('ListString',choice_variable,...
    'Name',"Select Drugs",'ListSize',[220 380],'PromptString',"Select Drugs",...
    'InitialValue', default_selection);
% if the user did not select anything, exit
if ~any_chosen_tf
    return
end
% Get a list of the drugs chosen
chosen_drugs = choice_variable(indexes_chosen);
% Prompt the user to select drugs to apply if they want to. Pressing
% 'No Selection' is allowed: drugs_to_apply is then simply empty.
[indexes_chosen, any_chosen_tf] = listdlg('ListString',choice_variable,'Name',"Select Drugs to apply",'ListSize',[220 380],...
    'CancelString','No Selection');
% Get a list of drugs to apply
drugs_to_apply = choice_variable(indexes_chosen);
%% Run PCA Analysis
PCA_analysis
\ No newline at end of file
# MorphEUS Classification Trials and cKNNs
Performs a classification trial on a workspace created with MicrobeJ_segmentation
Currently set to use default workspaces used in the MorphEUS paper, can be adjusted.
Run the script classification_trials.m to run a set of 70 classification trials. Results will be automatically saved into trial_results with the name the user selected along with a timestamp. A log containing all of the output produced during the run will be saved into the logs folder.
### Before you run
In order to run a classification trial, you need to download Hanchuan Peng's mRMR feature selection algorithm and compile it to work on your operating system. It can be found here:
[mRMR Feature Selection](https://www.mathworks.com/matlabcentral/fileexchange/14608-mrmr-feature-selection-using-mutual-information-computation?s_tid=prof_contriblnk). Place the resulting mRMR_0.9_compiled folder in the classification_trials directory.
### Workspaces for current presets
Joint profile: full_025x_and_3x_redos_no_A22.mat
Timecourse applying: all_timecourse_doseresponse.mat
High dose: full_3x_with_redos_no_A22_drug_name_only.mat
Low dose: full_025x_with_redos_v3_drug_names_only.mat
## Examining the data
To look through a set of classification trials, run cKNN_creator.m and select the desired workspace.
To create boxplots using cell data created with MicrobeJ_segmentation, run boxplots_and_KW.m and select the desired workspace(s).
To create a PCA, run PCA_creator.m.
%% Put every project subfolder on the MATLAB search path
% The folders are given relative to the current folder, so run this from
% the directory that contains them. They are added one at a time, in the
% order listed.
for path_entry_ = {'./analysis', './application_loops', './functions', ...
        './helper', './mRMR_0.9_compiled', './workspaces'}
    addpath(path_entry_{1})
end
% this is a script, so drop the loop variable to leave the caller's
% workspace as it was
clear path_entry_
\ No newline at end of file
%% Script to perform KNN on data outputted by PCA_analysis
% This script uses existing variables in the workspace to perform KNN
% and output graphs
%
% PREREQUISITES:
% - ** Must have run PCA_analysis.m and all of its prerequisites **
% - Most important variable is scores_table
% - All required settings in section below
% - Any desired optional variables listed below
%
% OUTPUT:
% - produced by the knn_helper script, which is run once per
%   configuration (TODO: document what knn_helper emits)
%
% DEPENDENCIES:
% - knn_helper ***
%
%% Required setting variables
%%% Must include all of these variables in the workspace before running %%%
%
% - make_median - boolean whether to make a plot of replicate medians
% - make_full - boolean whether to make a plot with each replicate as a point
% d_metric, create_graph, create_cm, large_group
%% Optional settings
%%% Optionally include these variables in the workspace before running if %
%%% desired. If not present, they will default to the specified value %
%
% - random_compare - (default: false) - make a randomized label plot to compare
% - tit_add - (default: "") - presumably text appended to plot titles;
%   confirm in knn_helper
% - confuse_controls - (default: false) - when true, forces
%   make_median = false and make_full = true
%% Add paths
addpath('./functions')
addpath('./helper/')
%% Generate defaults for optional settings
if ~exist('random_compare', 'var')
    random_compare = false;
    disp('defaulted random_compare to false');
end
if ~exist('tit_add', 'var')
    tit_add = "";
    disp('defaulted tit_add to ""');
end
if ~exist('confuse_controls','var')
    confuse_controls = false;
end
% confuse_controls overrides the two plot switches
if confuse_controls
    make_median = false;
    make_full = true;
end
% Debug print only. remove_after_TVN is not one of this script's documented
% inputs, so skip the print when it is absent instead of aborting before
% the validation section can report what is really missing.
if exist('remove_after_TVN', 'var')
    disp("begining of knn_analysis, value of remove_after_TVN")
    disp(remove_after_TVN)
end
%% Validate required settings
% Stop early, with a message naming the missing variable, when a required
% input is absent from the workspace.
% Produced by PCA_analysis -- only the two most important are checked.
if exist('scores_table', 'var') == 0
    error('Fatal error: Must run PCA_analysis before KNN_analysis; missing scores_table')
end
if exist('choice_variable', 'var') == 0
    error('Fatal error: Must run PCA_analysis before KNN_analysis; missing choice_variable')
end
% Settings specific to this script.
if exist('make_median', 'var') == 0
    error('Fatal error: Missing required settings variable make_median in KNN_analysis')
end
if exist('make_full', 'var') == 0
    error('Fatal error: Missing required settings variable make_full in KNN_analysis')
end
%% -- KNN analysis begins here --
% Each section below fills knn_type, knn_with_apply, knn_data and knn_drugs
% and then runs the knn_helper script. knn_helper is not visible here;
% presumably it reads those variables from the shared workspace -- confirm.
%% if doing full analysis with randomized labels (most likely just for backwards_select purposes)
%% Using each image after PCA
% using scores table--reassigning to new variable here to make it flexible
% Reads numeric_final_data_cols and choice_extension, which are not
% validated above and must already be in the workspace.
if make_full
knn_type = 'full';
knn_with_apply = false;
knn_table = scores_table;
knn_data = knn_table{:,numeric_final_data_cols};
knn_drugs = knn_table.(choice_extension);
knn_helper
end
%% Using medians after PCA
% Reads after_pca, which is not validated above and must already be in the
% workspace.
if make_median
% NOTE(review): this resets tit_add, discarding any value supplied by the
% caller as an optional setting -- confirm that is intended.
tit_add = "";
knn_type = 'median';
knn_with_apply = false;
if after_pca
% if after pca, want to use scores table
knn_after_pca = "KNN after PCA";
% use table_median to get the median values for each set of drugs in
% scores_table
[drug_medians,drugs] = table_median(scores_table,choice_extension);
else
% if before pca, want to use pca_whitened_table
knn_after_pca = "KNN before PCA";
% use table_median to get the median values for each set of drugs in
% pca_whitened_table
[drug_medians,drugs] = table_median(pca_whitened_table,choice_extension);
end
%~~~ Perform knn ~~~%
% go through each drug, make a separate row for the individual drug and
% compare it against the remainder
% use medians as knn_data
knn_data = drug_medians;
knn_drugs = drugs';
disp("list of drugs for knn in knn_analysis")
disp(knn_drugs)
% run helper script
knn_helper
% compare to random labels
% After this branch knn_drugs holds the shuffled labels and tit_add stays
% " with random labels"; neither is restored here.
if random_compare
disp("Rerunning with random labels")
tit_add = " with random labels";
% build a randomized copy of knn_drugs
random_drugs = randomize_list(knn_drugs);
% assign knn_drugs to be the new randomly created set
knn_drugs = random_drugs;
knn_helper
end
end
%% knn for applied -> individual
% Both sections below read apply and newspace_table, which are not
% validated earlier in this script and must already be in the workspace.
if apply
if make_full
knn_type = 'full';
knn_with_apply = true;
% using newspace_table--reassigning to new variable here to make it flexible
knn_table = newspace_table;
knn_data = knn_table{:,numeric_final_data_cols};
% NOTE(review): labels come from the DRUG column here, whereas the
% non-applied full run uses knn_table.(choice_extension) -- confirm the
% difference is intended.
knn_drugs = knn_table.DRUG;
knn_helper
end
end
%% create knn for applied
if apply
if make_median
knn_type = 'median';
knn_with_apply = true;
% NOTE(review): tit_add is not reset in this section, so it keeps whatever
% value the earlier sections left (possibly " with random labels").
% get median values with external function
[drug_medians,drugs] = table_median(newspace_table,choice_extension);
%% Perform knn
% go through each drug, make a separate row for the individual drug and
% compare it against the remainder
% use medians as knn_data
knn_data = drug_medians;
knn_drugs = drugs';
knn_helper
disp("did applied knn")
end
end
\ No newline at end of file
This diff is collapsed.
%% backwards select analysis
% all of the parts of backwards select without the prompts to load or save
%
% Setup portion: records run metadata in overall_vars and prepares the
% state used by the while loop that follows. Besides the two flags
% defaulted below, this section reads (without checking) pct_correct,
% numvar, final_data_table, remove_after_TVN, chosen_workspaces,
% efflux_in_lipid, d_metric, large_group, with_randomized_labels and
% do_joint_profile, so they must already be in the workspace.
% default these two flags as a safety
if ~exist('apply','var')
apply = false;
end
if ~exist('no_TVN','var')
no_TVN = false;
end
%% Start backwards select
% start off by getting original percent
disp(strcat(num2str(pct_correct), "% for ", num2str(numvar), " variables"))
% this is what we are looking to beat
baseline_score = pct_correct;
% minimum variable number to run until
minvar = 22;
% set keep_going to true to get while loop started
keep_going = true;
% save all of the best variable iterations
overall_vars = struct;
% use external function to get git info
git_info = getGitInfo();
% can save hash info in overall vars
try
overall_vars.hash = git_info.hash;
catch
% have a catch in case repository wasn't cloned
disp("unable to to save git hash information")
end
% store the drugs used to create this in overall_vars
overall_vars.DRUGS = unique(final_data_table.DRUG)';
% hard coding this for now b/c we're having problems
% remove_after_TVN = true;
% When remove_after_TVN is set, the DRUGS list stored just above is
% replaced by the same list without "Untreated".
if remove_after_TVN
% not actually using untreated, want to remove from .DRUG for clarity
all_drugs_from_final = unique(final_data_table.DRUG)';
unt_loca = find(strcmp(all_drugs_from_final,"Untreated"));
all_drugs_from_final(unt_loca) = [];
overall_vars.DRUGS = all_drugs_from_final;
end
% save the workspace used
overall_vars.WORKSPACE = chosen_workspaces;
% save whether efflux was in lipid or not
overall_vars.EFFLUX_IN_LIPID = efflux_in_lipid;
disp(strcat("distance metric is ", d_metric))
if large_group
disp("using broader groups")
else
disp("using finer groups")
end
% prepare a random permutation for shuffling the drug labels
% NOTE(review): rand_order has one entry per row of final_data_table; any
% later use on a table with fewer rows would index out of range -- verify.
% overall_vars.RANDOM is only created when randomization is on.
if with_randomized_labels
% determine the randomized order
rand_order = randperm(length(final_data_table.DRUG));
disp("random!")
overall_vars.RANDOM = true;
end
% store in the overall_vars if joint profile or not
if do_joint_profile
overall_vars.JOINT_PROFILE = true;
else
overall_vars.JOINT_PROFILE = false;
end
% once the max value is not going up anymore, we stop this loop
disp("Beginning backwards select....")
disp("value of remove_after_TVN")
disp(remove_after_TVN)
while keep_going
% want to make an empty % error thing to store
all_pcts = zeros(length(starting_vars),1);
% reset numvar
numvar = length(kept_vars);
% get title for structure
field_title = strcat("vars_",num2str(numvar),"_pct_",num2str(floor(baseline_score)));
% save current number of variables in structure
overall_vars.(field_title) = kept_vars;
%%% might also want to create a confusion matrix and save its handle in
%%% overall_vars?
%% Now want to loop through, removing one variable at a time from starting_vars and calculating error
for p = 1:length(kept_vars)
% reset to the full list of variables
starting_vars = kept_vars;
% get the name of the var removed
var_removed = starting_vars{p};
% remove a variable from the list
starting_vars(p) = [];
% perform TVN with newly selected features
if no_TVN
pca_whitened_table = final_data_table;
else
TVN_transform
end
% if applied drug, remove here
if apply
for qi = addedDrug
added_drug_1 = qi{1};
applied_inds = find(strcmp(pca_whitened_table.(choice_extension),added_drug_1));
% remove!!!
pca_whitened_table(applied_inds,:) = [];
end
end
% if we want to get rid of the untreated we gotta do it every time
%% Remove after TVN
% forcing this true for now but change later
if remove_after_TVN
drugs = unique(pca_whitened_table.DRUG)';
drug_indexes = cell2struct(cell(1,length(drugs)), drugs, 2);
for i = drugs
drug = i{1};
drug_indexes.(drug) = find(strcmp(pca_whitened_table.DRUG, drug)).';
end
pca_whitened_table(drug_indexes.Untreated,:) = [];
drugs = unique(pca_whitened_table.DRUG)';
drug_indexes = cell2struct(cell(1,length(drugs)), drugs, 2);
for i = drugs
drug = i{1};
drug_indexes.(drug) = find(strcmp(pca_whitened_table.DRUG, drug)).';
end
end
%% with randomized labels
if with_randomized_labels
% go through each drug and reorder in the random order defined
randomized_drugs = pca_whitened_table.DRUG;
for w = 1:length(randomized_drugs)
randomized_drugs(w) = pca_whitened_table.DRUG(rand_order(w));
end
pca_whitened_table.DRUG = randomized_drugs;
end
%% now time for the analysis
%find numeric columns of final table
numeric_final_data_cols = varfun(@isnumeric,pca_whitened_table,'OutputFormat', 'uniform');
%find column names
numeric_final_col_names = pca_whitened_table.Properties.VariableNames(numeric_final_data_cols);
if plot_before_tvn
%find numeric columns of final table
numeric_final_data_cols = varfun(@isnumeric,final_data_table,'OutputFormat', 'uniform');
numeric_final_col_names = final_data_table.Properties.VariableNames(numeric_final_data_cols);