Commit 57200512 authored by Rebecca E Batorsky's avatar Rebecca E Batorsky
Browse files

added complex to Insertion script processing and modified readme file

parent 322903c6
......@@ -54,7 +54,10 @@ This generates output files in a directory `PolyA1Seq_testdata_output` with the
`PolyA1Seq_testdata_deletion_consistency_log.txt`
`PolyA1Seq_testdata_deletion_consistency_table.txt`
- Outputs from Insertoin script
`PolyA1Seq_testdatatestdata_insertion_consistency2.csv`
`PolyA1Seq_testdatatestdata_insertion_consistency_long2.csv`
`PolyA1Seq_testdatatestdata_insertion_alignment2.csv`
- Outputs from Insertionn script, which is run on both insertions and complex separately
`PolyA1Seq_testdata_insertion_insertion_consistency2.csv`
`PolyA1Seq_testdata_insertion_insertion_consistency_long2.csv`
`PolyA1Seq_testdata_insertion_insertion_alignment2.csv`
`PolyA1Seq_testdata_complex_insertion_consistency2.csv`
`PolyA1Seq_testdata_complex_insertion_consistency_long2.csv`
`PolyA1Seq_testdata_complex_insertion_alignment2.csv`
\ No newline at end of file
......@@ -28,7 +28,7 @@ if (length(args)<5) {
}
a<-read.csv(insertion_in)
a_initial<-read.csv(insertion_in)
## get reference
hifibr_input = read.csv(hifi_in,header=T)
......@@ -47,8 +47,13 @@ ref = ref$ALIGNED_SEQ
n <- search_radius # number of bases to the left and right of the break you want to search of repeated motifs
p <- 10 # number of bases to the left and right of the initial repeat motif to search for homology
### for naming the output files
#plasmid <- "test_data_m6"
plasmid <- gsub("_reclassified.csv","",basename(hifi_in))
type <- gsub(plasmid, "", basename(insertion_in))
type <- gsub(".txt", "", type)
type <- gsub("_", "", type)
#L <- "GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTAT"
L = substr(ref,0,nick)
......@@ -68,21 +73,55 @@ sL1 <- substring(L, 1, (l-k1-1):l)
sR1 <- substring(R, 1:(r-k2-1),r)
a2=NULL # create empty vector to insert left del boundary
for (i in a[, 1]){
a3=NULL # create empty vector to insert RIGHT del boundary
## this loop checks to see if sequences meet the criteria
keep_seq=c()
for (n in 1:nrow(a_initial)){
i=a_initial[n, 1]
lbb_found=0
rbb_found=0
lb <- str_locate(as.character(i), sL1)
lb <- na.omit(lb)
lbb <- lb[nrow(lb),2]
a2[i] = lbb
if (length(lbb) > 0){
lbb_found=1
}
rb <- str_locate(as.character(i), sR1)
rb <- na.omit(rb)
rbb <- rb[1,1]
if (length(rbb)>0){
rbb_found=1
}
if ((lbb_found & rbb_found)){
keep_seq=c(keep_seq,i)
}else{
print("skipping seq that doesn't match any L or R")
print(i)
}
}
a=data.frame("RECONSTRUCTED_SEQ" = keep_seq)
for (n in 1:nrow(a)){
a3=NULL # create empty vector to insert RIGHT del boundary
for (i in a[, 1]){
i=a[n, 1]
lb <- str_locate(as.character(i), sL1)
lb <- na.omit(lb)
lbb <- lb[nrow(lb),2]
rb <- str_locate(as.character(i), sR1)
rb <- na.omit(rb)
rbb <- rb[1,1]
a2[i] = lbb
a3[i] = rbb
}
a4 <- cbind(as.data.frame(a2), as.data.frame(a3)) # combine left and right del boundary
a5 <- cbind(a, a4) # combine seq and del boundary
names(a5) <- c("RECONSTRUCTED_SEQ","left_del", "right_del") # rename columns for ease
......@@ -407,9 +446,9 @@ test2 <- test[!duplicated(test[,16]),]
pretty <- cbind(ID=test2[,1], as.data.frame(test2[,15]), as.data.frame(test2[,14]))
colnames(pretty)[2:3] <- c("insertion_alignment", "mechanism")
output1 <- paste0(out_dir, "/", plasmid, "testdata_insertion_consistency2.csv")
output2 <- paste0(out_dir, "/", plasmid, "testdata_insertion_consistency_long2.csv")
output3 <- paste0(out_dir, "/", plasmid, "testdata_insertion_alignment2.csv")
output1 <- paste0(out_dir, "/", plasmid, "_",type,"_insertion_consistency2.csv")
output2 <- paste0(out_dir, "/", plasmid, "_",type,"_insertion_consistency_long2.csv")
output3 <- paste0(out_dir, "/", plasmid, "_",type,"_insertion_alignment2.csv")
write.csv(master2, output1)
write.csv(test2, output2)
write.csv(pretty, output3)
......@@ -197,6 +197,7 @@ insertions = c("RECONSTRUCTED_SEQ",insertions)
write(insertions, file = paste0(out_dir,"/",out_string,"_insertion.txt"),append = FALSE, sep = "\n")
## unknown complex
comp_unknown = c("RECONSTRUCTED_SEQ",comp_unknown)
write(comp_unknown, file = paste0(out_dir,"/", out_string,"_complex.txt"),append = FALSE, sep = "\n")
## output file
......
......@@ -23,6 +23,7 @@ mkdir -p ${results_dir}
hifi_reclass=${results_dir}/${bn}_reclassified.csv
deletion_out=${results_dir}/${bn}_deletion.txt
insertion_out=${results_dir}/${bn}_insertion.txt
complex_out=${results_dir}/${bn}_complex.txt
echo ${hifi_reclass}
......@@ -44,14 +45,26 @@ echo "------"
echo "Done deletion script"
echo "------"
echo "------"
echo "Starting insertion consistency script"
echo "Starting insertion consistency script on insertions"
echo "------"
cd ../insertion/
Rscript INSERTION_PROGRAM.R ${hifi_reclass} ${insertion_out} $results_dir $breakpoint $search_radius
echo "------"
echo "Done insertion script"
echo "Done insertion script on insertions"
echo "------"
echo "------"
echo "Starting insertion consistency script on complex"
echo "------"
cd ../insertion/
Rscript INSERTION_PROGRAM.R ${hifi_reclass} ${complex_out} $results_dir $breakpoint $search_radius
echo "------"
echo "Done insertion script on complex"
echo "------"
echo "Results will be located in ${results_dir}"
......@@ -4,4 +4,4 @@ AATTGGAAAA_CTCATGTCCT,325M,325,325M,150,164,-11,0,-11,0,-11,150,164,11,NCTGTTATC
AATTGGAAAA_CTCATGTCCT,160M3D162M,322,160M3D162M,160,161,-1,-3,-1,-3,-4,160,161,1,A,complex,GATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGAGGGAAAAAATTCGTACTTTGGAGTACGAAAATTGGAAAATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTATCCAAGGCCAAACAGGCCGGCGCCTCCTAACGATCCTCTAGCTCATGTCCTGAACGTTAACGTTAACGTAACGTTAACTCGAGGCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACC,40,,NA,1,2.5
AATTGGAAAA_CTCATGTCCT,117M1I208M,326,117M1I208M,121,208,-40,44,-40,0,-40,121,164,41,ATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTATCCC,complex,GATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGAGGGAAAAAATTCGTACTTTGGAGTACGAAAATTGGAAAAATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTATCCCTAGAGGCCAAACAGGCCGGCGCCTCCTAACGATCCTCTAGCTCATGTCCTGAACGTTAACGTTAACGTAACGTTAACTCGAGGCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACC,15,,,1,0
AATTGGAAAA_CTCATGTCCT,157M2D166M,323,157M2D166M,157,166,-4,2,-4,0,-4,157,164,2,CC,complex,GATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGAGGGAAAAAATTCGTACTTTGGAGTACGAAAATTGGAAAATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTACCTAGAGGCCAAACAGGCCGGCGCCTCCTAACGATCCTCTAGCTCATGTCCTGAACGTTAACGTTAACGTAACGTTAACTCGAGGCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACC,10,,NA,1,0
AATTGGAAAA_CTCATGTCCT,161M3D161M,322,161M3D161M,161,161,0,-3,0,-3,-3,161,161,0,,deletion,GATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGAGGGAAAAAATTCGTACTTTGGAGTACGAAAATTGGAAAATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTATCCCAGGCCAAACAGGCCGGCGCCTCCTAACGATCCTCTAGCTCATGTCCTGAACGTTAACGTTAACGTAACGTTAACTCGAGGCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACC,10,,0,1,0
AATTGGAAAA_CTCATGTCCT,161M3D161M,322,161M3D161M,161,161,0,-3,0,-3,-3,161,161,0,,deletion,GATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGAGGGAAAAAATTCGTACTTTGGAGTACGAAAATTGGAAAATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTATCCCAGGCCAAACAGGCCGGCGCCTCCTAACGATCCTCTAGCTCATGTCCTGAACGTTAACGTTAACGTAACGTTAACTCGAGGCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACC,10,,0,1,0
\ No newline at end of file
RECONSTRUCTED_SEQ
GATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGAGGGAAAAAATTCGTACTTTGGAGTACGAAAATTGGAAAATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTATCCAAGGCCAAACAGGCCGGCGCCTCCTAACGATCCTCTAGCTCATGTCCTGAACGTTAACGTTAACGTAACGTTAACTCGAGGCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACC
GATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGAGGGAAAAAATTCGTACTTTGGAGTACGAAAATTGGAAAAATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTATCCCTAGAGGCCAAACAGGCCGGCGCCTCCTAACGATCCTCTAGCTCATGTCCTGAACGTTAACGTTAACGTAACGTTAACTCGAGGCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACC
"","ID","insertion_alignment","mechanism"
"1","PolyA1Seq_testdata-1","GATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGAGGGAAAAAATTCGTACTTTGGAGTACGAAAATTGGAAAATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTATCCAAGGCCAAACAGGCCGGCGCCTCCTAACGATCCTCTAGCTCATGTCCTGAACGTTAACGTTAACGTAACGTTAACTCGAGGCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACC","seq"
"2","PolyA1Seq_testdata-1","0","Loop-out"
"3","PolyA1Seq_testdata-1","0","Snap-back"
"","ID","DR_START","DR_END","RC_START","RC_END","consistency","RECONSTRUCTED_SEQ","left_del","right_del","del_seq","insertion","plasmid","DRmotif_length","RCmotif_length","Loop-out","Snap-back"
"1","PolyA1Seq_testdata-1",NA,NA,NA,NA,"FALSE","GATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGAGGGAAAAAATTCGTACTTTGGAGTACGAAAATTGGAAAATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTATCCAAGGCCAAACAGGCCGGCGCCTCCTAACGATCCTCTAGCTCATGTCCTGAACGTTAACGTTAACGTAACGTTAACTCGAGGCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACC",160,162,"GATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGAGGGAAAAAATTCGTACTTTGGAGTACGAAAATTGGAAAATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTATCCAGGCCAAACAGGCCGGCGCCTCCTAACGATCCTCTAGCTCATGTCCTGAACGTTAACGTTAACGTAACGTTAACTCGAGGCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACC","A","PolyA1Seq_testdata",NA,NA,"0","0"
"","ID","DR_START","DR_END","RC_START","RC_END","consistency","left_del","right_del","del_seq","insertion","plasmid","DRmotif_length","RCmotif_length","mechanism","insertion_alignment","unicorn"
"1","PolyA1Seq_testdata-1",NA,NA,NA,NA,"FALSE",160,162,"GATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGAGGGAAAAAATTCGTACTTTGGAGTACGAAAATTGGAAAATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTATCCAGGCCAAACAGGCCGGCGCCTCCTAACGATCCTCTAGCTCATGTCCTGAACGTTAACGTTAACGTAACGTTAACTCGAGGCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACC","A","PolyA1Seq_testdata",NA,NA,"seq","GATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGAGGGAAAAAATTCGTACTTTGGAGTACGAAAATTGGAAAATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTATCCAAGGCCAAACAGGCCGGCGCCTCCTAACGATCCTCTAGCTCATGTCCTGAACGTTAACGTTAACGTAACGTTAACTCGAGGCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACC","PolyA1Seq_testdata-1-seq-GATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGAGGGAAAAAATTCGTACTTTGGAGTACGAAAATTGGAAAATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTATCCAAGGCCAAACAGGCCGGCGCCTCCTAACGATCCTCTAGCTCATGTCCTGAACGTTAACGTTAACGTAACGTTAACTCGAGGCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACC"
"2","PolyA1Seq_testdata-1",NA,NA,NA,NA,"FALSE",160,162,"GATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGAGGGAAAAAATTCGTACTTTGGAGTACGAAAATTGGAAAATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTATCCAGGCCAAACAGGCCGGCGCCTCCTAACGATCCTCTAGCTCATGTCCTGAACGTTAACGTTAACGTAACGTTAACTCGAGGCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACC","A","PolyA1Seq_testdata",NA,NA,"Loop-out","0","PolyA1Seq_testdata-1-Loop-out-0"
"3","PolyA1Seq_testdata-1",NA,NA,NA,NA,"FALSE",160,162,"GATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGATCCTAGGAGGGAAAAAATTCGTACTTTGGAGTACGAAAATTGGAAAATAGAGCAGCACTCACCTTATTGTCATTACCCTGTTATCCAGGCCAAACAGGCCGGCGCCTCCTAACGATCCTCTAGCTCATGTCCTGAACGTTAACGTTAACGTAACGTTAACTCGAGGCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACCCCAGGACC","A","PolyA1Seq_testdata",NA,NA,"Snap-back","0","PolyA1Seq_testdata-1-Snap-back-0"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment