Datasets can be downloaded from: https://tinyurl.com/Popgen2025PUZenodo
FASTQ files are the output from sequencing platforms (like Illumina) and typically consist of four lines per read:
@SRR15369215.126490887 # Sequence identifier
GGACCTTCTGTCA... # Nucleotide sequence
+ # Separator
AAFFFJJJJJJJJ... # Base call quality scores
less fastq_data/SRR836370_1_subset.fastq.gz | grep '^@' | wc -l
echo $(( $(zcat fastq_data/SRR836370_1_subset.fastq.gz | wc -l) / 4 ))
less fastq_data/SRR836370_1_subset.fastq.gz | wc -l | awk '{print $1 / 4}'
For all files:
#Strategy 1: count the total number of lines and divide by 4 since all fastq reads are represented by 4 lines as explained
for file in fastq_data/*.fq.gz; do
read_count=$(( $(zcat "$file" | wc -l) / 4 ))
echo "$(basename "$file"): $read_count reads"
done
#Strategy 2: count the number of line with + since all fastq reads have exactly one line with the only character being a single +
less fastq_data/SRR836370_1_subset.fastq.gz | grep "^+$" | wc -l
zcat fastq_data/SRR836370_1_subset.fastq.gz | awk 'NR % 4 == 2 {print length($0)}' | awk '$1<150' | wc -l
For all files:
for file in fastq_data/*.fq.gz; do
reads=$(zcat "$file" | awk 'NR % 4 == 2 {print length($0)}' | awk '$1<150' | wc -l)
echo "$(basename "$file"): $reads reads shorter than 150bp"
done
zcat fastq_data/SRR836370_1_subset.fastq.gz | awk 'NR % 4 == 2 {print length($0)}' | awk '$1>150' | wc -l
For all files:
for file in fastq_data/*.fq.gz; do
reads=$(zcat "$file" | awk 'NR % 4 == 2 {print length($0)}' | awk '$1>150' | wc -l)
echo "$(basename "$file"): $reads reads longer than 150bp"
done
zcat fastq_data/SRR836370_1_subset.fastq.gz | awk 'NR % 4 == 2 {if (length($0) != 150) print length($0)}' | wc -l
For all files:
for file in fastq_data/*.fq.gz; do
count=$(zcat "$file" | awk 'NR % 4 == 2 {if (length($0) != 150) print length($0)}' | wc -l)
echo "$(basename "$file"): $count reads not equal to 150bp"
done
zcat fastq_data/SRR836370_1_subset.fastq.gz | awk 'NR % 4 == 2' | grep -c 'CTGTCTCTTATACACATCT'
# Note that this command looks fro complete match to the adpater sequence. Most reads will only have a partial lenght of the adapter seqeunced
For all files:
for fastq_data/file in *.fq.gz; do
count=$(zcat "$file" | awk 'NR % 4 == 2' | grep -c 'CTGTCTCTTATACACATCT')
echo "$file: $count"
done
Expected: All files should report 0 contamination.