########################################################################
# DADA2 pipeline for 16S rRNA gene amplicon analysis
#
# Project title : Soil Salinity Alters Acemannan Content and Sodium Uptake 
#                 in Aloe barbadensis Miller plants. A Role for 
#                 Root-Associated Microbial Communities?
#
# Authors       : Nikolaou et al.
# Script ver.   : 1.0 (September 2025)
# Environment   : R (version: 4.3.1), dada2 (version: 1.28)
#
# Notes         :
# - EDIT THE THREE PATHS BELOW (path_fastq, silva_train, silva_species).
# - Paired-end filenames assumed to contain "_16S_1" (forward) and "_16S_2" (reverse).
# - Adjust truncLen/maxEE to your read length & quality profiles.
########################################################################

## (Optional) Minimal dependency installation ---------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# BiocManager::install("dada2")   # core
# install.packages("ggplot2")     # for plotting

## 0) Setup -------------------------------------------------------------
suppressPackageStartupMessages({
  library(dada2)
  library(ggplot2)
})

set.seed(42)
mythreads <- max(1, parallel::detectCores() - 1)

# >>>>>>>>>>>>>>>>>>>>>>>>>>>> EDIT THESE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
path_fastq   <- "Aloe_16S"  # folder with demultiplexed .fastq(.gz)
silva_train  <- "path/to/silva_nr99_v138.1_train_set.fa.gz"
silva_species<- "path/to/silva_species_assignment_v138.1.fa.gz"
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

stopifnot(dir.exists(path_fastq))
stopifnot(file.exists(silva_train), file.exists(silva_species))

# Outputs
out_dir   <- "dada2_outputs"
filt_dir  <- file.path(out_dir, "filtered")
plots_dir <- file.path(out_dir, "plots")
dir.create(out_dir,   showWarnings = FALSE, recursive = TRUE)
dir.create(filt_dir,  showWarnings = FALSE, recursive = TRUE)
dir.create(plots_dir, showWarnings = FALSE, recursive = TRUE)

# Input files (paired-end)
fnFs <- sort(list.files(path_fastq, pattern = "_16S_1", full.names = TRUE))
fnRs <- sort(list.files(path_fastq, pattern = "_16S_2", full.names = TRUE))
stopifnot(length(fnFs) == length(fnRs), length(fnFs) > 0)

# Sample names: everything before "_16S"
sample.names <- sapply(strsplit(basename(fnFs), "_16S"), `[`, 1)

## 1) Raw quality profiles ----------------------------------------------
pdf(file = file.path(plots_dir, "01_seq_quality_profiles_raw.pdf"),
    height = 4, width = 8, onefile = TRUE)
for(i in seq_along(fnFs)){
  message(sprintf("Quality (raw) %d/%d: %s", i, length(fnFs), sample.names[i]))
  print(plotQualityProfile(c(fnFs[i], fnRs[i])))
}
dev.off()

## 2) Filtering & trimming ----------------------------------------------
filtFs <- file.path(filt_dir, paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(filt_dir, paste0(sample.names, "_R_filt.fastq.gz"))

# Typical PE settings; tune for your data (see quality plots)
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs,
                     maxN = 0,
                     maxEE = c(2, 2),
                     truncQ = 2,
                     truncLen = c(250, 200),
                     rm.phix = TRUE,
                     compress = TRUE,
                     multithread = mythreads,
                     matchIDs = TRUE)

write.table(out, file = file.path(out_dir, "02_filterAndTrim_summary.tsv"),
            sep = "\t", quote = FALSE, col.names = NA)

pdf(file = file.path(plots_dir, "02_seq_quality_profiles_filtered.pdf"),
    height = 4, width = 8, onefile = TRUE)
for(i in seq_along(filtFs)){
  message(sprintf("Quality (filt) %d/%d: %s", i, length(filtFs), sample.names[i]))
  print(plotQualityProfile(c(filtFs[i], filtRs[i])))
}
dev.off()

## 3) Learn error models -------------------------------------------------
errF <- learnErrors(filtFs, multithread = mythreads)
errR <- learnErrors(filtRs, multithread = mythreads)

pdf(file = file.path(plots_dir, "03_error_model_F.pdf"), height = 5, width = 5)
plotErrors(errF, nominalQ = TRUE)
dev.off()
pdf(file = file.path(plots_dir, "03_error_model_R.pdf"), height = 5, width = 5)
plotErrors(errR, nominalQ = TRUE)
dev.off()

## 4) Denoise & merge ----------------------------------------------------
names(filtFs) <- sample.names
names(filtRs) <- sample.names

dadaFs <- dada(filtFs, err = errF, multithread = mythreads)
dadaRs <- dada(filtRs, err = errR, multithread = mythreads)

mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose = TRUE)

## 5) Sequence table & chimera removal ----------------------------------
seqtab <- makeSequenceTable(mergers)
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus",
                                    multithread = mythreads, verbose = TRUE)

retained_fraction <- sum(seqtab.nochim) / sum(seqtab)
writeLines(sprintf("Retained non-chimeric reads fraction: %.3f", retained_fraction),
           con = file.path(out_dir, "05_retained_fraction.txt"))

## 6) Read tracking ------------------------------------------------------
getN <- function(x) { sum(getUniques(x)) }
track <- cbind(out,
               denoisedF = sapply(dadaFs, getN),
               denoisedR = sapply(dadaRs, getN),
               merged    = sapply(mergers, getN),
               nonchim   = rowSums(seqtab.nochim))
colnames(track)[1:2] <- c("input", "filtered")
rownames(track) <- sample.names

write.table(track, file = file.path(out_dir, "06_read_tracking.tsv"),
            sep = "\t", quote = FALSE, col.names = NA)

## 7) Taxonomic assignment (SILVA v138.1) --------------------------------
taxa <- assignTaxonomy(seqtab.nochim,
                       refFasta   = silva_train,
                       minBoot    = 80,
                       multithread= mythreads,
                       tryRC      = TRUE)

taxa <- addSpecies(taxa, refFasta = silva_species)

# Save core objects
saveRDS(seqtab.nochim, file = file.path(out_dir, "07_seqtab_nochim.rds"))
saveRDS(taxa,          file = file.path(out_dir, "07_taxa_matrix.rds"))

## 8) Optional: remove non-targets (Chloroplast/Mitochondria/Eukaryota) --
# Columns expected: Kingdom, Phylum, Class, Order, Family, Genus, Species
is.chloro <- taxa[, "Order"]  %in% "Chloroplast"
seqtab.nochloro <- seqtab.nochim[, !is.chloro, drop = FALSE]
taxa.nochloro   <- taxa[!is.chloro, , drop = FALSE]

is.mito <- taxa.nochloro[, "Family"] %in% "Mitochondria"
seqtab.nomito <- seqtab.nochloro[, !is.mito, drop = FALSE]
taxa.nomito   <- taxa.nochloro[!is.mito, , drop = FALSE]

is.euk <- taxa.nomito[, "Kingdom"] %in% "Eukaryota"
seqtab.16S <- seqtab.nomito[, !is.euk, drop = FALSE]
taxa.16S   <- taxa.nomito[!is.euk, , drop = FALSE]

# Remove unassigned at Kingdom safely (NA check)
is.na.kingdom <- is.na(taxa.16S[, "Kingdom"])
seqtab.16S <- seqtab.16S[, !is.na.kingdom, drop = FALSE]
taxa.16S   <- taxa.16S[!is.na.kingdom, , drop = FALSE]

capture.output({
  cat("Dimensions after removing non-targets:\n")
  print(dim(seqtab.16S))
  cat("\nTaxonomy columns:\n")
  print(colnames(taxa.16S))
}, file = file.path(out_dir, "08_non_targets_summary.txt"))

## 9) Export ASV fasta / counts / taxonomy -------------------------------
asv_seqs <- colnames(seqtab.16S)
asv_headers <- sprintf(">ASV_%s", seq_along(asv_seqs))

# FASTA
asv_fasta <- as.vector(rbind(asv_headers, asv_seqs))
write(asv_fasta, file = file.path(out_dir, "09_ASVs.fa"))

# Counts (ASVs as rows, samples as cols)
asv_tab <- t(seqtab.16S)
row.names(asv_tab) <- sub("^>", "", asv_headers)
write.table(asv_tab, file = file.path(out_dir, "09_ASVs_counts.tsv"),
            sep = "\t", quote = FALSE, col.names = NA)

# Taxonomy (same rownames as counts)
rownames(taxa.16S) <- row.names(asv_tab)
write.table(taxa.16S, file = file.path(out_dir, "09_ASVs_taxonomy.tsv"),
            sep = "\t", quote = FALSE, col.names = NA)

## 10) Session info ------------------------------------------------------
sink(file = file.path(out_dir, "10_sessionInfo.txt"))
sessionInfo()
sink()

########################################################################
# End of script
########################################################################