% flatten_dose_response.m
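%% Flattens multiple doses into columns
%
% This is a script (no function wrapper): it reads and writes variables in
% the workspace it is run from.
%
% For every experiment (EXP), ID and replicate (REP) found in
% final_data_table, the per-dose sub-tables are placed side by side so that
% each output row carries one observation per dose, with the dose appended
% to every numeric column name.
%
% Reads:  final_data_table, chosen_doses, chosen_ids, chosen_drugs_by_id,
%         chosen_doses_by_drug, chosen_doses_by_id, bayesian, rng_val
% Writes: final_data_table (ID_EXP and DOSE columns added), split_tables,
%         chosen_doses_by_drug (for IDs that get padded), combined_tables,
%         final_combined_table, missing_data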

%% Count how many doses were requested
num_chosen_doses = length(chosen_doses);

%% Add ID_EXP and DOSE columns
% Logical mask over the table's variables: true where the column is numeric.
numeric_final_data_cols = varfun(@isnumeric, final_data_table, 'OutputFormat', 'uniform');

% Split the table into its non-numeric (label) part and its numeric part.
non_numeric  = final_data_table(:, ~numeric_final_data_cols);
numeric_data = final_data_table(:,  numeric_final_data_cols);

% ID_EXP is "<ID>_<EXP>"; DOSE is whatever follows "<ID>_" inside DRUG.
id_exp   = cellstr(strcat(final_data_table.ID, "_", final_data_table.EXP));
dose_col = cellstr(extractAfter(final_data_table.DRUG, strcat(final_data_table.ID, "_")));

% The two new label columns go at the end of the non-numeric part ...
non_numeric.ID_EXP = id_exp;
non_numeric.DOSE   = dose_col;

% ... and the table is rebuilt as [labels, numbers].
final_data_table = horzcat(non_numeric, numeric_data);


%% Recompute the numeric bookkeeping for the rebuilt table
% The column order changed above, so the mask and both sub-tables are stale.
numeric_final_data_cols = varfun(@isnumeric, final_data_table, 'OutputFormat', 'uniform');

% Names of the numeric columns
numeric_final_col_names = final_data_table.Properties.VariableNames(numeric_final_data_cols);

% Numeric part and non-numeric part of the rebuilt table
numeric_data = final_data_table(:,  numeric_final_data_cols);
non_numeric  = final_data_table(:, ~numeric_final_data_cols);

%% Split tables into tree
% Builds a nested struct addressed as split_tables.(exp).(id).(rep).(drug).
% Each leaf is the sub-table of final_data_table whose EXP, ID, REP and DRUG
% match that path:
%
%   exp1 - id1 - rep1 - drug_dose1
%        |     |      - drug_dose2
%        |     - rep2 - ...
%        - id2 - ...
%   exp2 - ...
%
% Note: the variable name `exp` shadows MATLAB's built-in exp() from here on.

exp_list = unique(final_data_table.EXP).';

split_tables = struct();

for i = exp_list
    exp = i{1};

    % Rows belonging to this experiment
    exp_table = final_data_table(strcmp(final_data_table.EXP, exp), :);

    % Assemble this experiment's branch locally, then attach it in one go
    exp_branch = struct();

    for j = chosen_ids
        id = j{1};

        % Rows belonging to this ID within the experiment
        id_table = exp_table(strcmp(exp_table.ID, id), :);

        id_branch = struct();

        % Reps are discovered from the data, not taken from a fixed list
        rep_list = unique(id_table.REP).';
        for k = rep_list
            rep = k{1};

            % Rows belonging to this rep within the ID
            rep_table = id_table(strcmp(id_table.REP, rep), :);

            rep_branch = struct();

            % One leaf per chosen drug (drug_dose) of this ID
            for q = chosen_drugs_by_id.(id)
                drug = q{1};

                drug_table = rep_table(strcmp(rep_table.DRUG, drug), :);
                rep_branch.(drug) = drug_table;
            end

            id_branch.(rep) = rep_branch;
        end

        exp_branch.(id) = id_branch;
    end

    split_tables.(exp) = exp_branch;
end

% %% Ensure rep list is valid
% if sum(strcmp({'rep1', 'rep2', 'rep3'}, rep_list)) ~= length(rep_list)
%     error("Rep list invalid; fix the code you dummy")
% end

%% Bayesian mode: pool every Untreated rep into one pseudo-rep
% When `bayesian` is true, the per-rep Untreated tables of experiment
% drug34_doseResponse are stacked into a single rep named 'all_reps', and the
% REP column of the stacked tables is overwritten with 'ALL' so the following
% steps see one rep. The experiment, ID and dose names are still hard-coded.
if bayesian
    unt_reps = fieldnames(split_tables.drug34_doseResponse.Untreated)';

    % Fail with a clear message instead of an obscure missing-field error
    if isempty(unt_reps)
        error("flatten_dose_response:noUntreatedReps", ...
            "No Untreated reps found in drug34_doseResponse; nothing to pool.");
    end

    % Dose tables to pool. The order here fixes the field order of all_reps,
    % which the later concatenation step follows when laying out columns.
    unt_doses = {'Untreated_3x', 'Untreated_025x'};

    % Pooled tables are collected here and attached once all reps are merged
    pooled = struct;

    for i = unt_reps
        rep = i{1};

        for d = unt_doses
            dose_name = d{1};
            rep_dose_table = split_tables.drug34_doseResponse.Untreated.(rep).(dose_name);

            % Start the pooled table with whichever rep comes first (not
            % necessarily one called "rep1"); stack every later rep below it.
            if isfield(pooled, dose_name)
                pooled.(dose_name) = vertcat(pooled.(dose_name), rep_dose_table);
            else
                pooled.(dose_name) = rep_dose_table;
            end
        end

        % This rep is now represented inside the pooled tables
        split_tables.drug34_doseResponse.Untreated = ...
            rmfield(split_tables.drug34_doseResponse.Untreated, rep);
    end

    % Relabel REP on every row of each pooled table. Each table is handled
    % on its own height, so tables with different row counts are both fully
    % relabelled and never indexed past their end.
    for d = unt_doses
        dose_name = d{1};
        pooled.(dose_name).REP(:) = {'ALL'};
    end

    split_tables.drug34_doseResponse.Untreated.all_reps = pooled;
end
%% Pad IDs that were recorded at fewer doses than requested
% An ID with fewer drug entries than num_chosen_doses (e.g. a control) gets
% its first drug table duplicated under generated names <id>1 ... <id>N, and
% chosen_doses_by_drug.(id) is rewritten to map those names to chosen_doses.

for i = exp_list
    exp = i{1};

    for j = chosen_ids
        id = j{1};
        rep_list = fieldnames(split_tables.(exp).(id)).';

        for k = rep_list
            rep = k{1};

            % Nothing to pad when this ID already has every requested dose
            if length(chosen_drugs_by_id.(id)) >= num_chosen_doses
                continue
            end

            % Report what is being padded and why
            fill_msg = strcat("Filling in horizontal data for ",...
                id, " at [", exp, ", ", rep, "] (Only have ",...
                num2str(length(chosen_drugs_by_id.(id))), " dose(s), but need ",...
                num2str(num_chosen_doses), ")");
            disp(fill_msg)

            % The first recorded drug table of this ID is the template
            drug_to_copy = chosen_drugs_by_id.(id){1};
            table_to_copy = split_tables.(exp).(id).(rep).(drug_to_copy);

            % Generated field names, one per requested dose (eg DMSO1, DMSO2)
            names = arrayfun(@(n) strcat(id, num2str(n)), 1:num_chosen_doses, 'UniformOutput', false);

            % Replace the rep's contents with one copy of the template per name
            padded = struct();
            for w = names
                name = w{1};
                padded.(name) = table_to_copy;
            end
            split_tables.(exp).(id).(rep) = padded;

            % Keep the name -> dose lookup in step with the generated names
            chosen_doses_by_drug.(id) = cell2struct(chosen_doses, names, 2);
        end
    end
end



%% Horizontally concatenate doses for each rep and fix table columns
% For every [exp, id, rep] the per-dose tables in split_tables are trimmed to
% a common row count, their numeric columns are suffixed with the dose, and
% they are joined side by side behind a one-row-per-observation header built
% from the non-numeric columns.
%
% Results:
%   combined_tables.(exp).(id).(rep) - combined table for that rep
%   final_combined_table             - all combined tables stacked vertically
%   missing_data                     - " | "-separated notes for reps with no rows
missing_data = "";
combined_tables = struct();
final_combined_table = table();

% set the rng for reproducibility (row removal below is random)
rng(rng_val);

%loop through exp
for i=exp_list
    exp = i{1};
    
    combined_tables.(exp) = struct();
    
    %loop through id
    for j=chosen_ids
        id = j{1};
        
        combined_tables.(exp).(id) = struct();
        
        rep_list = fieldnames(split_tables.(exp).(id)).';
        %loop through rep
        for k=rep_list
            rep = k{1};
            %disp(strcat("Combining doses for: [", exp, ", ", id, ", ", rep, "]")) 
            
            % Names of the dose tables stored under this rep
            fields = fieldnames(split_tables.(exp).(id).(rep));
            
            % Subset each dose table to numeric cols and non numeric cols
            non_numeric_tables = structfun(@(t) t(:, ~numeric_final_data_cols), split_tables.(exp).(id).(rep), 'UniformOutput', false);
            numeric_tables = structfun(@(t) t(:, numeric_final_data_cols), split_tables.(exp).(id).(rep), 'UniformOutput', false);

            % Append dose to each numeric column name so that columns stay
            % unique once the dose tables are placed side by side
            for q=fieldnames(numeric_tables).'
                field = q{1};

                dose = chosen_doses_by_drug.(id).(field);

                numeric_tables.(field).Properties.VariableNames = cellfun(@(c) strcat(c, '_', dose), ...
                    numeric_tables.(field).Properties.VariableNames, 'UniformOutput', false);
            end

            %~~ HorzCat tables ~~%
            
            % Row count of every dose table, and the smallest of them; all
            % tables are cut down to that count so they can be joined
            table_rows = structfun(@(t) size(t, 1), split_tables.(exp).(id).(rep), 'UniformOutput', false);
            table_rows_mat = cell2mat(struct2cell(table_rows));
            min_num_rows = min(table_rows_mat);
            
            % Per-dose surplus over the minimum, and the doses that have one
            num_extra_rows = structfun(@(n) n - min_num_rows, table_rows, 'UniformOutput', false);
            indx = structfun(@(n) n > 0, num_extra_rows, 'UniformOutput', true).';
            drugs_with_extra_rows = fields(indx).';
            
            % Remove extra rows
            if ~isempty(drugs_with_extra_rows)
                for q=drugs_with_extra_rows
                    drug = q{1};

                    % Generate random row indexes to remove
                    indexes_to_remove = randperm(table_rows.(drug), num_extra_rows.(drug));

                    % Keep the IMG column as it was before removal, for the report below
                    local_imgs = non_numeric_tables.(drug).IMG;

                    % Remove from both numeric and non_numeric tables
                    numeric_tables.(drug)(indexes_to_remove, :) = [];
                    non_numeric_tables.(drug)(indexes_to_remove, :) = [];

                    % Tell user what we have done
                    disp(strcat(drug, " at [", exp, ", ", rep, "] had ", num2str(num_extra_rows.(drug)), ...
                        " too many rows. Removing images (randomly picked): "))
                    removed_local_imgs = local_imgs(indexes_to_remove).';
                    for w = removed_local_imgs
                        disp(strcat(" - ", w{1}))
                    end
                end
            end
            
            % Loop through rows that will be combined, and aggregate
            % non_numeric data from each field in the struct
            combined_non_numeric_header = table();
            for r=1:min_num_rows
                % Take the r-th non-numeric row from every dose table and
                % stack them: one row per dose
                rth_rows = cellfun(@(t) t(r, :), struct2cell(non_numeric_tables), 'UniformOutput', false);
                original_header = vertcat(rth_rows{:});
                
                
                % Calculate extra cols we want
                
                %imgs (IMG_025x, IMG_3x ...) plus IMG = all of them joined
                img_cols = table();
                for g=fieldnames(non_numeric_tables).'
                    drug = g{1};
                    img_cols.(strcat('IMG_', chosen_doses_by_drug.(id).(drug))) = convertCharsToStrings(non_numeric_tables.(drug).IMG{r});
                end
                img_cols.IMG = strjoin(img_cols{1,:}, '-@-');
                
                % Date: a single value when all doses agree, otherwise joined
                date_col = table();
                if all(strcmp(original_header.DATE{1}, original_header.DATE))
                    date_label = original_header.DATE{1};
                else
                    date_label = strjoin(original_header.DATE, '_&_');
                end
                date_col.DATE = {convertStringsToChars(date_label)};
                
                % dose (DOSE = 025x_&_3x), or N/A when this ID has no doses
                dose_col = table();
                dose_label = strjoin(original_header.DOSE, '_&_');
                if isempty(chosen_doses_by_id.(id))
                    dose_label = 'N/A';
                end
                dose_col.DOSE = {dose_label};
                
                % Drug / ID
                % Set DRUG to ID, so that we don't have to change the
                % pipeline
                original_header.DRUG = original_header.ID;
                
                
                % Remove the cols that are rebuilt above (IMG, DATE, DOSE)
                % together with the other columns listed here
                cols_to_remove = {'IMG', 'DATE','DOSE', 'BATCH', 'DRUG_DATE', 'DRUG_EXP', 'SET', 'SET_REP'};
                % Only remove if they exist
                for c = cols_to_remove
                    col = c{1};
                    if any(strcmp(col, original_header.Properties.VariableNames))
                        original_header.(col) = [];
                        %disp(strcat("removed ", col));
                    end
                end
                
        %%% DYNAMICALLY REMOVES NON-UNIQUE VARS; DOESN'T WORK WITH DMSO %%%
                % Remove any cols with more than one unique value %%
%                 for h=original_header.Properties.VariableNames
%                     varname = h{1};
%                     if length(unique(original_header.(varname))) > 1
%                         original_header.(varname) = [];
%                         disp(strcat("Removed var ", varname, " because it is not unique across doses."))
%                     end
%                 end

                % Collapse the remaining cols to their unique value. This
                % fails when the columns do not all collapse to the same
                % number of rows; report which rep and which columns were
                % involved, and keep the underlying message.
                try
                    original_header = varfun(@unique, original_header);
                catch cause
                    error("flatten_dose_response:nonUniqueHeader", ...
                        "More columns than expected for [%s, %s, %s]; add the offending column(s) to cols_to_remove in flatten_dose_response.m. Remaining columns: %s. Cause: %s", ...
                        exp, id, rep, ...
                        strjoin(original_header.Properties.VariableNames, ', '), ...
                        cause.message);
                end
                % varfun prefixes every name with the function name; undo that
                original_header.Properties.VariableNames = extractAfter(original_header.Properties.VariableNames, 'unique_');
                
                % Get summarized row
                new_row = [original_header date_col img_cols dose_col];
                
                
                % Convert all values to string objects
                new_row = varfun(@convertCharsToStrings, new_row);
                new_row.Properties.VariableNames = extractAfter(new_row.Properties.VariableNames, 'convertCharsToStrings_');
                
                % Add to combined_non_numeric_table
                combined_non_numeric_header = [combined_non_numeric_header; new_row];
            end
                
            % Combine numeric tables side by side, in struct field order
            numeric_tables_cell = struct2cell(numeric_tables);
            combined_numeric_table = horzcat(numeric_tables_cell{:});
            
            % Add non_numeric_cols
            combined_table = [combined_non_numeric_header combined_numeric_table];
            
            % Check if table has no rows, and add to combined table if all
            % good.
            if size(combined_table, 1) <= 0
                msg = strcat("Missing data for [", id, ", ", rep, ", ", exp, "]");
                warning(msg);
                missing_data = strcat(missing_data, " | ", msg);
            else
                % Add to aggregate tables
                combined_tables.(exp).(id).(rep) = combined_table;
                final_combined_table = [final_combined_table; combined_table];
            end
        end
    end
end