Datasets can be downloaded from: Zenodo Record 14258052
FASTQ files are the output from sequencing platforms (like Illumina) and typically consist of four lines per read:
@SRR15369215.126490887 # Sequence identifier
GGACCTTCTGTCA... # Nucleotide sequence
+ # Separator
AAFFFJJJJJJJJ... # Base call quality scores
After 100-150 base pairs, the accuracy of base calling decreases. Trimming is usually done to retain only high-quality bases.
for file in *\?download=1; do
newname=$(echo "$file" | sed "s/?download=1//")
mv "$file" "$newname"
done
less LGS1_sub_1.fq.gz | grep '^@' | wc -l
echo $(( $(zcat LGS1_sub_1.fq.gz | wc -l) / 4 ))
less LGS1_sub_1.fq.gz | wc -l | awk '{print $1 / 4}'
For all files:
for file in *.fq.gz; do
read_count=$(( $(zcat "$file" | wc -l) / 4 ))
echo "$(basename "$file"): $read_count reads"
done
zcat BEN_CI16_sub_1.fq.gz | awk 'NR % 4 == 2 {print length($0)}' | awk '$1<150' | wc -l
For all files:
for file in *.fq.gz; do
reads=$(zcat "$file" | awk 'NR % 4 == 2 {print length($0)}' | awk '$1<150' | wc -l)
echo "$(basename "$file"): $reads reads shorter than 150bp"
done
zcat BEN_CI16_sub_1.fq.gz | awk 'NR % 4 == 2 {print length($0)}' | awk '$1>150' | wc -l
For all files:
for file in *.fq.gz; do
reads=$(zcat "$file" | awk 'NR % 4 == 2 {print length($0)}' | awk '$1>150' | wc -l)
echo "$(basename "$file"): $reads reads longer than 150bp"
done
zcat BEN_CI16_sub_1.fq.gz | awk 'NR % 4 == 2 {if (length($0) != 150) print length($0)}' | wc -l
For all files:
for file in *.fq.gz; do
count=$(zcat "$file" | awk 'NR % 4 == 2 {if (length($0) != 150) print length($0)}' | wc -l)
echo "$(basename "$file"): $count reads not equal to 150bp"
done
zcat BEN_CI16_sub_1.fq.gz | awk 'NR % 4 == 2' | grep -c 'CTGTCTCTTATACACATCT'
For all files:
for file in *.fq.gz; do
count=$(zcat "$file" | awk 'NR % 4 == 2' | grep -c 'CTGTCTCTTATACACATCT')
echo "$file: $count"
done
Expected: All files should report 0 contamination.