process_hifibr.R 6.17 KB
Newer Older
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
1
2
3
4
5
6
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(Biostrings))

7
# test if there is at least 5 argument: if not, return an error
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
 if (length(args)<5) {
   stop("Usage: Rscript process_hifibr.R input_file.csv outdir search_radius debug", call.=FALSE)
 } else if (length(args)==5) {
   input_file = args[1]
   out_dir = args[2]
   search_radius=as.integer(args[3])
   break_location=as.integer(args[4])
   debug=as.integer(args[5])
 }

## 
#input_file="~/Box/rebecca_documents/sdmmej_data/rerun_inconsistent_14Oct20/PolyA1Seq_reclassified_inconsistent.csv"
#out_dir="~/Box/rebecca_documents/sdmmej_data/rerun_inconsistent_14Oct20/"
#search_radius=30
#break_location=161
#debug=1
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
24

25
26
27
28
29
30
print(paste0("Input file: " ,input_file))
print(paste0("Output dir: ", out_dir))
print(paste0("Search radius: ", search_radius))
print(paste0("Break location: ", break_location))
print(paste0("Debug: ", debug))

Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
31
32
33
34
35
hifibr_input = read.csv(input_file)

## add an id col and col for reclassified CLASS
hifibr_input <- tibble::rowid_to_column(hifibr_input, "ID")
hifibr_input$CLASS_final = ifelse(hifibr_input$CLASS %in% c('deletion','insertion','exact') , hifibr_input$CLASS, NA)
36
hifibr_input$CLASS_final = ifelse(hifibr_input$READS < 10 & hifibr_input$CLASS != 'exact', 'filtered',hifibr_input$CLASS)
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52

## names for output
in_string  = basename(input_file)
out_string = gsub(".csv","",in_string)

## check to make sure only one ref, otherwise exit
ref = hifibr_input %>% dplyr::filter(., CLASS == 'exact')
ref_seq = ref$ALIGNED_SEQ

n_ref = nrow(ref)
if ( n_ref != 1){
  print(paste0("ERROR: Found ", n_ref, " reference sequences in Hifibr output."))
  quit(save = "no", status = 1, runLast = FALSE)
}

# separate sequences and filter for length
53
54
55
hifibr_input_filter_del = hifibr_input %>% dplyr::filter(.,READS >= 10 & CLASS == 'deletion')
hifibr_input_filter_ins = hifibr_input %>% dplyr::filter(.,READS >= 10 & CLASS == 'insertion')
hifibr_input_filter_complex = hifibr_input %>% dplyr::filter(.,READS >= 10 & CLASS == 'complex')
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
56
57
58
59
60
61
62

# process deletions
deletions = hifibr_input_filter_del$ALIGNED_SEQ

aligned_deletions = c()

for (seq in deletions){
63
  sigma <- nucleotideSubstitutionMatrix(match = 2, mismatch = -1, baseOnly = FALSE)
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
64
65
66
67
68
69
  align <- pairwiseAlignment(seq, ref_seq, substitutionMatrix = sigma, gapOpening = -2,
                                       gapExtension = -8, scoreOnly = FALSE)
  align_string <- toString(align@pattern)
  aligned_deletions = c(aligned_deletions,align_string)
}

70
71
72
73
if(debug == 1){
  print("we have these aligned deletions")
  print(aligned_deletions)
}
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
74
75
76
77

# process insertions
insertions = hifibr_input_filter_ins$ALIGNED_SEQ

78
79
80
81
if(debug == 1){
  print("we have these insertions")
  print(insertions)
}
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
82
83
84
85
86
87
# process complex
comp_unknown = c()

for (row in 1:nrow(hifibr_input_filter_complex)){
  seq = hifibr_input_filter_complex[row, 'ALIGNED_SEQ']
  id = hifibr_input_filter_complex[row,'ID']
88
  if(debug == 1){
89
    print(paste0("id is: ",id))
90
91
  }
  sigma <- nucleotideSubstitutionMatrix(match = 2, mismatch = -1, baseOnly = FALSE)
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
92
93
94
  align <- pairwiseAlignment(seq, ref_seq, substitutionMatrix = sigma, gapOpening = -2,
                             gapExtension = -8, scoreOnly = FALSE)
  
95
  if(debug == 1){
96
97
98
99
100
    print("ref")
    print(toString(align@subject))
    print("seq")
    print(toString(align@pattern))
    print("insertion")
101
    print(insertion(align))
102
    print("deletion")
103
    print(deletion(align))
104
    print("mismatch")
105
106
    print(mismatchTable(align))
  }
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
107
  
108
109
110
111
112
113
114
  ins_info = insertion(align)[[1]]
  del_info = deletion(align)[[1]]
  mis_info = mismatchTable(align)
  
  ins_n = length(ins_info)
  ins_pos = start(ins_info)
  ins_r = abs(break_location - ins_pos)
115
  
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
116
    
117
118
119
120
  del_n = length(del_info)
  del_pos = start(del_info)
  del_r = abs(break_location - del_pos)
  
121
122
123
124
125
126
127
128
  if(debug==1){
    print("deln")
    print(del_n)
    print("del_pos")
    print(del_pos)
    print("del_r")
    print(del_r)
  }
129
130
131
132
133
134
135
136
137
138
139
140
141
  mis_n = nmismatch(align)
  mis_pos = mis_info$SubjectStart
  mis_r = abs(break_location - mis_pos)
  
  # check if it's a single deletion within the search radius
  if (ins_n == 0 & mis_n == 0 & del_n ==1 ){
    if (del_r < search_radius){
      align_string <- toString(align@pattern)
      aligned_deletions = c(aligned_deletions, align_string)
      hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'deletion'
      
      if(debug == 1){
        print("found del")
142
143
144
145
      }
    }else{
      if(debug == 1){
        print(paste0("found del outside search radius at ", del_r))
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
      }
    }
  }else if (mis_n == 0 & del_n ==0 & ins_n == 1 ){
    if (ins_r < search_radius){
      insertions = c(insertions, seq)
      hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'insertion'
      
      if(debug == 1){
        print("found ins")
      }
    }else{
      if(debug == 1){
        print("found complex (del outside search radius)")
      }
      comp_unknown = c(comp_unknown, seq)
      hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'complex'
    }
  }else if (del_n ==0 & ins_n == 0 & mis_n == 1){
    if (mis_r < search_radius){
      insertions = c(insertions, seq)
      hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'insertion'
      
      if(debug == 1){
        print("found single mismatch ins")
      }
    }else{
      if(debug == 1){
        print("found complex (ins outside search radius)")
      }
      comp_unknown = c(comp_unknown, seq)
      hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'complex'
    }
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
178
  }else{
179
180
181
    if(debug == 1){
      print("found complex")
    }
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
182
183
184
185
186
187
    comp_unknown = c(comp_unknown, seq)
    hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'complex'
  }
}

## write out
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
188
189
dir.create(out_dir,showWarnings = FALSE)

Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
190
## deletions
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
191
write(aligned_deletions, file = paste0(out_dir,"/", out_string,"_deletion.txt"),append = FALSE, sep = "\n")
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
192
193

## insertions
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
194
195
insertions = c("RECONSTRUCTED_SEQ",insertions)
write(insertions, file = paste0(out_dir,"/",out_string,"_insertion.txt"),append = FALSE, sep = "\n")
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
196
197

## unknown complex
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
198
write(comp_unknown, file = paste0(out_dir,"/", out_string,"_complex.txt"),append = FALSE, sep = "\n")
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
199
200

## output file
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
201
write.csv(hifibr_input, file = paste0(out_dir,"/", out_string,"_reclassified.csv"), row.names = FALSE)
202