process_hifibr.R 4.52 KB
Newer Older
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
1
2
3
4
5
6
7
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(Biostrings))

# test if there is at least one argument: if not, return an error
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
8
9
10
if (length(args)<2) {
  stop("Usage: Rscript process_hifibr.R input_file.csv outdir", call.=FALSE)
} else if (length(args)==2) {
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
11
  input_file = args[1]
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
12
  out_dir = args[2]
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
}

#input_file="~/Documents/git/sdmmej/test_data/TestData_HiFiBR_Output_mod.csv"
hifibr_input = read.csv(input_file)

## add an id col and col for reclassified CLASS
hifibr_input <- tibble::rowid_to_column(hifibr_input, "ID")
hifibr_input$CLASS_final = ifelse(hifibr_input$CLASS %in% c('deletion','insertion','exact') , hifibr_input$CLASS, NA)
hifibr_input$CLASS_final = ifelse(hifibr_input$`X._OF_READS` < 10 & hifibr_input$CLASS != 'exact', 'filtered',hifibr_input$CLASS)

## names for output
in_string  = basename(input_file)
out_string = gsub(".csv","",in_string)

## check to make sure only one ref, otherwise exit
ref = hifibr_input %>% dplyr::filter(., CLASS == 'exact')
ref_seq = ref$ALIGNED_SEQ

n_ref = nrow(ref)
if ( n_ref != 1){
  print(paste0("ERROR: Found ", n_ref, " reference sequences in Hifibr output."))
  quit(save = "no", status = 1, runLast = FALSE)
}


# separate sequences and filter for length
hifibr_input_filter_del = hifibr_input %>% dplyr::filter(., `X._OF_READS` > 10 & CLASS == 'deletion')
hifibr_input_filter_ins = hifibr_input %>% dplyr::filter(., `X._OF_READS` > 10 & CLASS == 'insertion')
hifibr_input_filter_complex = hifibr_input %>% dplyr::filter(., `X._OF_READS` > 10 & CLASS == 'complex')

# process deletions
deletions = hifibr_input_filter_del$ALIGNED_SEQ

aligned_deletions = c()

for (seq in deletions){
  sigma <- nucleotideSubstitutionMatrix(match = 2, mismatch = -1, baseOnly = TRUE)
  align <- pairwiseAlignment(seq, ref_seq, substitutionMatrix = sigma, gapOpening = -2,
                                       gapExtension = -8, scoreOnly = FALSE)
  align_string <- toString(align@pattern)
  aligned_deletions = c(aligned_deletions,align_string)
}

#print("we have these aligned deletions")
#print(aligned_deletions)

# process insertions
insertions = hifibr_input_filter_ins$ALIGNED_SEQ

#print("we have these insertions")
#print(insertions)

# process complex
comp_unknown = c()

for (row in 1:nrow(hifibr_input_filter_complex)){
  seq = hifibr_input_filter_complex[row, 'ALIGNED_SEQ']
  id = hifibr_input_filter_complex[row,'ID']
  #print(id)
  align <- pairwiseAlignment(seq, ref_seq, substitutionMatrix = sigma, gapOpening = -2,
                             gapExtension = -8, scoreOnly = FALSE)
  
  ins_width = length(width(insertion(align))[[1]])
  del_width = length(width(deletion(align))[[1]])
  nmismatch = nmismatch(align)
  
  if (ins_width == 0 & del_width > 0 & nmismatch == 0){
    align_string <- toString(align@pattern)
    aligned_deletions = c(aligned_deletions, align_string)
    hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'deletion'
    
    #print("found del")
    #print("ref")
    #print(toString(align@subject))
    #print("seq")
    #print(toString(align@pattern))
  }else if (ins_width > 0 & del_width == 0 & nmismatch == 0){
    insertions = c(insertions, seq)
    hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'insertion'
    
    # print("found ins")
    # print("ref")
    # print(toString(align@subject))
    # print("seq")
    # print(toString(align@pattern))
  }else if (ins_width == 0 & del_width == 0 & nmismatch == 1){
    insertions = c(insertions, seq)
    hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'insertion'
    
    # print("found single mismatch ins")
    # print("ref")
    # print(toString(align@subject))
    # print("seq")
    # print(toString(align@pattern))
  }else{
    # print("found complex")
    # print("ref")
    # print(toString(align@subject))
    # print("seq")
    # print(toString(align@pattern))
    comp_unknown = c(comp_unknown, seq)
    hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'complex'
  }
}

## write out
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
119
120
dir.create(out_dir,showWarnings = FALSE)

Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
121
## deletions
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
122
write(aligned_deletions, file = paste0(out_dir,"/", out_string,"_deletion.txt"),append = FALSE, sep = "\n")
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
123
124

## insertions
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
125
126
insertions = c("RECONSTRUCTED_SEQ",insertions)
write(insertions, file = paste0(out_dir,"/",out_string,"_insertion.txt"),append = FALSE, sep = "\n")
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
127
128

## unknown complex
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
129
write(comp_unknown, file = paste0(out_dir,"/", out_string,"_complex.txt"),append = FALSE, sep = "\n")
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
130
131

## output file
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
132
write.csv(hifibr_input, file = paste0(out_dir,"/", out_string,"_reclassified.csv"), row.names = FALSE)