process_hifibr.R 4.52 KB
Newer Older
Rebecca E Batorsky's avatar
Rebecca E Batorsky committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(Biostrings))

# test if there is at least one argument: if not, return an error
if (length(args)==0) {
  stop("Usage: Rscript process_hifibr.R input_file.csv", call.=FALSE)
} else if (length(args)==1) {
  input_file = args[1]
}

#input_file="~/Documents/git/sdmmej/test_data/TestData_HiFiBR_Output_mod.csv"
hifibr_input = read.csv(input_file)

## add an id col and col for reclassified CLASS
hifibr_input <- tibble::rowid_to_column(hifibr_input, "ID")
hifibr_input$CLASS_final = ifelse(hifibr_input$CLASS %in% c('deletion','insertion','exact') , hifibr_input$CLASS, NA)
hifibr_input$CLASS_final = ifelse(hifibr_input$`X._OF_READS` < 10 & hifibr_input$CLASS != 'exact', 'filtered',hifibr_input$CLASS)

## names for output
in_string  = basename(input_file)
out_string = gsub(".csv","",in_string)

## check to make sure only one ref, otherwise exit
ref = hifibr_input %>% dplyr::filter(., CLASS == 'exact')
ref_seq = ref$ALIGNED_SEQ

n_ref = nrow(ref)
if ( n_ref != 1){
  print(paste0("ERROR: Found ", n_ref, " reference sequences in Hifibr output."))
  quit(save = "no", status = 1, runLast = FALSE)
}


# separate sequences and filter for length
hifibr_input_filter_del = hifibr_input %>% dplyr::filter(., `X._OF_READS` > 10 & CLASS == 'deletion')
hifibr_input_filter_ins = hifibr_input %>% dplyr::filter(., `X._OF_READS` > 10 & CLASS == 'insertion')
hifibr_input_filter_complex = hifibr_input %>% dplyr::filter(., `X._OF_READS` > 10 & CLASS == 'complex')

# process deletions
deletions = hifibr_input_filter_del$ALIGNED_SEQ

aligned_deletions = c()

for (seq in deletions){
  sigma <- nucleotideSubstitutionMatrix(match = 2, mismatch = -1, baseOnly = TRUE)
  align <- pairwiseAlignment(seq, ref_seq, substitutionMatrix = sigma, gapOpening = -2,
                                       gapExtension = -8, scoreOnly = FALSE)
  align_string <- toString(align@pattern)
  aligned_deletions = c(aligned_deletions,align_string)
}

#print("we have these aligned deletions")
#print(aligned_deletions)

# process insertions
insertions = hifibr_input_filter_ins$ALIGNED_SEQ

#print("we have these insertions")
#print(insertions)

# process complex
comp_unknown = c()

for (row in 1:nrow(hifibr_input_filter_complex)){
  seq = hifibr_input_filter_complex[row, 'ALIGNED_SEQ']
  id = hifibr_input_filter_complex[row,'ID']
  #print(id)
  align <- pairwiseAlignment(seq, ref_seq, substitutionMatrix = sigma, gapOpening = -2,
                             gapExtension = -8, scoreOnly = FALSE)
  
  ins_width = length(width(insertion(align))[[1]])
  del_width = length(width(deletion(align))[[1]])
  nmismatch = nmismatch(align)
  
  if (ins_width == 0 & del_width > 0 & nmismatch == 0){
    align_string <- toString(align@pattern)
    aligned_deletions = c(aligned_deletions, align_string)
    hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'deletion'
    
    #print("found del")
    #print("ref")
    #print(toString(align@subject))
    #print("seq")
    #print(toString(align@pattern))
  }else if (ins_width > 0 & del_width == 0 & nmismatch == 0){
    insertions = c(insertions, seq)
    hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'insertion'
    
    # print("found ins")
    # print("ref")
    # print(toString(align@subject))
    # print("seq")
    # print(toString(align@pattern))
  }else if (ins_width == 0 & del_width == 0 & nmismatch == 1){
    insertions = c(insertions, seq)
    hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'insertion'
    
    # print("found single mismatch ins")
    # print("ref")
    # print(toString(align@subject))
    # print("seq")
    # print(toString(align@pattern))
  }else{
    # print("found complex")
    # print("ref")
    # print(toString(align@subject))
    # print("seq")
    # print(toString(align@pattern))
    comp_unknown = c(comp_unknown, seq)
    hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'complex'
  }
}

## write out
## deletions
write(aligned_deletions, file = paste0("~/Documents/git/sdmmej/test_data/output/",out_string,"_deletion.txt"),append = FALSE, sep = "\n")

## insertions
write(insertions, file = paste0("~/Documents/git/sdmmej/test_data/output/",out_string,"_insertion.txt"),append = FALSE, sep = "\n")

## unknown complex
write(comp_unknown, file = paste0("~/Documents/git/sdmmej/test_data//output/",out_string,"_complex.txt"),append = FALSE, sep = "\n")

## output file
write.csv(hifibr_input, file = paste0("~/Documents/git/sdmmej/test_data/output/",out_string,"_reclassified.csv"), row.names = FALSE)