SeqAn3 3.3.0-rc.1
The Modern C++ library for sequence analysis.
sequence_file/input.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <cassert>
16#include <filesystem>
17#include <fstream>
18#include <string>
19#include <variant>
20#include <vector>
21
41
42namespace seqan3
43{
44
45// ----------------------------------------------------------------------------
46// sequence_file_input_traits
47// ----------------------------------------------------------------------------
48
97template <typename t>
99 requires (t v) {
104
107
110 };
112
113// ----------------------------------------------------------------------------
114// sequence_file_input_default_traits
115// ----------------------------------------------------------------------------
116
133{
141
144
146 template <typename _sequence_alphabet>
148
150 using id_alphabet = char;
151
153 template <typename _id_alphabet>
155
158
160 template <typename _quality_alphabet>
162
164};
165
169{
177
181};
182
183// ----------------------------------------------------------------------------
184// sequence_file_input
185// ----------------------------------------------------------------------------
186
209{
210public:
216 using traits_type = traits_type_;
218 using selected_field_ids = selected_field_ids_;
220 using valid_formats = valid_formats_;
222 using stream_char_type = char;
224
229
// Compile-time validation of the selected fields: every field listed in
// selected_field_ids::as_array must also be contained in field_ids.
// The immediately invoked constexpr lambda returns false at the first field for which
// field_ids::contains(f) is false, which makes the assertion fail with the message below.
230 static_assert(
231 []() constexpr {
232 for (field f : selected_field_ids::as_array)
233 if (!field_ids::contains(f))
234 return false;
235 return true;
236 }(),
237 "You selected a field that is not valid for sequence files, please refer to the documentation "
238 "of sequence_file_input::field_ids for the accepted values.");
239
248 using id_type = typename traits_type::template id_container<typename traits_type::id_alphabet>;
250 using quality_type = typename traits_type::template quality_container<typename traits_type::quality_alphabet>;
253
258
268 using const_reference = void;
270 using size_type = size_t;
276 using const_iterator = void;
278 using sentinel = std::default_sentinel_t;
280
296
314 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
316 {
317 primary_stream->rdbuf()->pubsetbuf(stream_buffer.data(), stream_buffer.size());
319 ->open(filename, std::ios_base::in | std::ios::binary);
320
321 if (!primary_stream->good())
322 throw file_open_error{"Could not open file " + filename.string() + " for reading."};
323
324 // possibly add intermediate compression stream
326
327 // initialise format handler or throw if format is not found
328 using format_variant_t =
329 typename detail::variant_from_tags<valid_formats, detail::sequence_file_input_format_exposer>::type;
330 format_variant_t format_variant{};
331 detail::set_format(format_variant, filename);
332
334 [&](auto && selected_format)
335 {
336 using format_t = std::remove_cvref_t<decltype(selected_format)>;
337 format = std::make_unique<selected_sequence_format<format_t>>();
338 },
339 format_variant);
340 }
341 /* NOTE(h-2): Curiously we do not need a user-defined deduction guide for the above constructor.
342 * A combination of default template parameters and auto-deduction guides works as expected,
343 * independent of whether the second/optional parameter is specified or not, i.e. it is possible
344 * to auto-deduce and overwrite a single template parameter out of the four if the optional parameter
345 * is specified and use the default otherwise.
346 */
347
// Construct from an existing stream (taken by lvalue reference) and an explicitly specified format.
// The overload is constrained so that the stream's char_type must be the same as stream_char_type.
// format_tag and fields_tag are wrapped in SEQAN3_DOXYGEN_ONLY; presumably they are only used to
// select/deduce the template arguments and are not read in the body — confirm against the macro.
// NOTE(review): listing lines 367 and 374 are not visible in this view (the numbering jumps), so the
// stream member initialisation and the statement following the "compression stream" comment
// cannot be documented from here.
362 template <input_stream stream_t, sequence_file_input_format file_format>
363 requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
364 sequence_file_input(stream_t & stream,
365 file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
366 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
368 format{std::make_unique<selected_sequence_format<file_format>>()}
369 {
// Reject, at compile time, a format tag that is not part of this file's valid_formats list.
370 static_assert(list_traits::contains<file_format, valid_formats>,
371 "You selected a format that is not in the valid_formats of this file.");
372
373 // possibly add intermediate compression stream
375 }
376
// Overload of the stream constructor that takes the stream as `stream_t &&`.
// The given stream is moved into a newly allocated stream_t that is stored in primary_stream
// together with stream_deleter_default as its deleter.
// The overload is constrained so that the stream's char_type must be the same as stream_char_type.
// NOTE(review): listing line 390 is not visible in this view, so the statement following the
// "compression stream" comment cannot be documented from here.
378 template <input_stream stream_t, sequence_file_input_format file_format>
379 requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
380 sequence_file_input(stream_t && stream,
381 file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
382 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
383 primary_stream{new stream_t{std::move(stream)}, stream_deleter_default},
384 format{std::make_unique<selected_sequence_format<file_format>>()}
385 {
// Reject, at compile time, a format tag that is not part of this file's valid_formats list.
386 static_assert(list_traits::contains<file_format, valid_formats>,
387 "You selected a format that is not in the valid_formats of this file.");
388
389 // possibly add intermediate compression stream
391 }
393
413 {
414 // buffer first record
416 {
419 }
420
421 return {*this};
422 }
423
// Returns a sentinel for comparison with the iterator.
// `sentinel` is an alias for std::default_sentinel_t, so a value-initialised object is returned.
437 sentinel end() noexcept
438 {
439 return {};
440 }
441
// Returns the record the file is currently at, obtained by dereferencing begin().
// NOTE(review): this function is declared noexcept but forwards to begin(), which buffers the
// first record and is not shown as noexcept in this view — confirm that begin() cannot throw,
// otherwise an escaping exception would terminate the program.
465 reference front() noexcept
466 {
467 return *begin();
468 }
470
475
476protected:
478
488
497 {}
500 {
501 delete ptr;
502 }
503
508
512 bool at_end{false};
514
515private:
518 {
519 // clear the record
521
522 // at end if we could not read further
525 {
526 at_end = true;
527 return;
528 }
529
531 }
532
544 {
553 virtual ~sequence_format_base() = default;
555
567 virtual void read_sequence_record(std::istream & instream,
571 };
572
584 template <typename format_t>
586 {
597
603 {
604 // read new record
605 {
607 options,
609 detail::get_or_ignore<field::seq>(record_buffer),
610 detail::get_or_ignore<field::id>(record_buffer),
611 detail::get_or_ignore<field::qual>(record_buffer));
612 }
613 }
614
617 };
618
621
623 friend iterator;
624};
625
632template <input_stream stream_type, sequence_file_input_format file_format>
633sequence_file_input(stream_type & stream,
634 file_format const &)
636 typename sequence_file_input<>::selected_field_ids, // default field ids.
638
640template <input_stream stream_type, sequence_file_input_format file_format>
641sequence_file_input(stream_type && stream,
642 file_format const &)
644 typename sequence_file_input<>::selected_field_ids, // default field ids.
646
648template <input_stream stream_type,
649 sequence_file_input_format file_format,
651sequence_file_input(stream_type && stream,
652 file_format const &,
653 selected_field_ids const &)
657
659template <input_stream stream_type,
660 sequence_file_input_format file_format,
662sequence_file_input(stream_type & stream,
663 file_format const &,
664 selected_field_ids const &)
669
670} // namespace seqan3
Provides seqan3::aa27, container aliases and string literals.
Provides alphabet adaptations for standard char types.
The twenty-seven letter amino acid alphabet.
Definition: aa27.hpp:46
Input iterator necessary for providing a range-like interface in input file.
Definition: in_file_iterator.hpp:41
The 15 letter DNA alphabet, containing all IUPAC symbols minus the gap.
Definition: dna15.hpp:51
The five letter DNA alphabet of A,C,G,T and the unknown character N.
Definition: dna5.hpp:51
Quality type for traditional Sanger and modern Illumina Phred scores.
Definition: phred42.hpp:47
The generic concept for sequence file in formats.
Definition: sequence_file/input_format_concept.hpp:99
A class for reading sequence files, e.g. FASTA, FASTQ ...
Definition: sequence_file/input.hpp:209
static void stream_deleter_default(std::basic_istream< stream_char_type > *ptr)
Stream deleter with default behaviour (ownership assumed).
Definition: sequence_file/input.hpp:499
void const_reference
The const_reference type is void, because files are not const-iterable.
Definition: sequence_file/input.hpp:268
std::default_sentinel_t sentinel
The type returned by end().
Definition: sequence_file/input.hpp:278
sequence_file_input(std::filesystem::path filename, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from filename.
Definition: sequence_file/input.hpp:313
reference front() noexcept
Return the record we are currently at in the file.
Definition: sequence_file/input.hpp:465
iterator begin()
Returns an iterator to current position in the file.
Definition: sequence_file/input.hpp:412
sequence_file_input_options_type options
The options are public and its members can be set directly.
Definition: sequence_file/input.hpp:474
typename traits_type::template sequence_container< typename traits_type::sequence_alphabet > sequence_type
The type of field::seq (std::vector<seqan3::dna5> by default).
Definition: sequence_file/input.hpp:246
sentinel end() noexcept
Returns a sentinel for comparison with iterator.
Definition: sequence_file/input.hpp:437
record_type record_buffer
Buffer for a single record.
Definition: sequence_file/input.hpp:482
char stream_char_type
Character type of the stream(s).
Definition: sequence_file/input.hpp:222
sequence_file_input(stream_type &stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format > >
This is an overloaded member function, provided for convenience. It differs from the above function o...
sequence_file_input(stream_type &&stream, file_format const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, typename sequence_file_input<>::selected_field_ids, type_list< file_format > >
This is an overloaded member function, provided for convenience. It differs from the above function o...
size_t size_type
An unsigned integer type, usually std::size_t.
Definition: sequence_file/input.hpp:270
stream_ptr_t primary_stream
The primary stream is the user provided stream or the file stream if constructed from filename.
Definition: sequence_file/input.hpp:505
sequence_file_input(sequence_file_input const &)=delete
Copy construction is explicitly deleted, because you can't have multiple access to the same file.
typename traits_type::template id_container< typename traits_type::id_alphabet > id_type
The type of field::id (std::string by default).
Definition: sequence_file/input.hpp:248
~sequence_file_input()=default
Destructor is defaulted.
sequence_file_input & operator=(sequence_file_input &&)=default
Move assignment is defaulted.
sequence_file_input(sequence_file_input &&)=default
Move construction is defaulted.
void const_iterator
The const iterator type is void, because files are not const-iterable.
Definition: sequence_file/input.hpp:276
sequence_file_input & operator=(sequence_file_input const &)=delete
Copy assignment is explicitly deleted, because you can't have multiple access to the same file.
sequence_file_input(stream_type &stream, file_format const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, typename sequence_file_input<>::selected_field_ids, type_list< file_format > >
Deduces the sequence input file type from the stream and the format.
sequence_file_input(stream_t &&stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: sequence_file/input.hpp:380
sequence_file_input()=delete
Default constructor is explicitly deleted, you need to give a stream or file name.
std::streampos position_buffer
Buffer for the previous record position.
Definition: sequence_file/input.hpp:486
traits_type_ traits_type
A traits type that defines aliases and template for storage of the fields.
Definition: sequence_file/input.hpp:216
selected_field_ids_ selected_field_ids
A seqan3::fields list with the fields selected for the record.
Definition: sequence_file/input.hpp:218
static void stream_deleter_noop(std::basic_istream< stream_char_type > *)
Stream deleter that does nothing (no ownership assumed).
Definition: sequence_file/input.hpp:496
bool first_record_was_read
Tracks whether the very first record is buffered when calling begin().
Definition: sequence_file/input.hpp:510
sequence_file_input(stream_type &&stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format > >
Deduces the sequence input file type from the stream, the format and the field ids.
typename traits_type::template quality_container< typename traits_type::quality_alphabet > quality_type
The type of field::qual (std::vector<seqan3::phred42> by default).
Definition: sequence_file/input.hpp:250
sequence_file_input(stream_t &stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from an existing stream and with specified format.
Definition: sequence_file/input.hpp:364
friend iterator
Befriend iterator so it can access the buffers.
Definition: sequence_file/input.hpp:623
std::unique_ptr< sequence_format_base > format
An instance of the detected/selected format.
Definition: sequence_file/input.hpp:620
std::vector< char > stream_buffer
A larger (compared to stl default) stream buffer to use when reading from a file.
Definition: sequence_file/input.hpp:484
valid_formats_ valid_formats
A seqan3::type_list with the possible formats.
Definition: sequence_file/input.hpp:220
bool at_end
File is at position 1 behind the last record.
Definition: sequence_file/input.hpp:512
stream_ptr_t secondary_stream
The secondary stream is a compression layer on the primary or just points to the primary (no compress...
Definition: sequence_file/input.hpp:507
void read_next_record()
Tell the format to move to the next record and update the buffer.
Definition: sequence_file/input.hpp:517
sequence_record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids > record_type
The type of the record, a specialisation of seqan3::record; acts as a tuple of the selected field typ...
Definition: sequence_file/input.hpp:256
Auxiliary concept that checks whether a type is a specialisation of seqan3::fields.
Definition: detail/record.hpp:35
Auxiliary concept that checks whether a type is a seqan3::type_list and all types meet seqan3::sequen...
Definition: sequence_file/input_format_concept.hpp:171
T data(T... args)
Provides auxiliary data structures and functions for seqan3::record and seqan3::fields.
Provides seqan3::dna15, container aliases and string literals.
Provides seqan3::dna5, container aliases and string literals.
Provides the seqan3::sequence_file_format_genbank class.
Provides the seqan3::format_sam.
T get(T... args)
field
An enumerator for the fields used in file formats.
Definition: record.hpp:63
void set_format(format_variant_type &format, std::filesystem::path const &file_name)
Sets the file format according to the file name extension.
Definition: io/detail/misc.hpp:68
auto make_secondary_istream(std::basic_istream< char_t > &primary_stream, std::filesystem::path &filename) -> std::unique_ptr< std::basic_istream< char_t >, std::function< void(std::basic_istream< char_t > *)> >
Depending on the magic bytes of the given stream, return a decompression stream or forward the primar...
Definition: misc_input.hpp:80
Provides the seqan3::detail::in_file_iterator class template.
Checks whether from can be explicitly converted to to.
A more refined container concept than seqan3::container.
The requirements a traits_type for seqan3::sequence_file_input must meet.
Refines seqan3::alphabet and adds assignability.
A concept that indicates whether a writable alphabet represents quality scores.
Provides exceptions used in the I/O module.
Stream concepts.
Provides various utility functions required only for input.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides seqan3::phred42 quality scores.
Provides quality alphabet composites.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_record.
T size(T... args)
void read_sequence_record(ts &&... args)
Forwards to the seqan3::sequence_file_input_format::read_sequence_record interface.
Definition: sequence_file/input_format_concept.hpp:47
A class template that holds a choice of seqan3::field.
Definition: record.hpp:128
static constexpr bool contains(field f)
Whether a field is contained in the parameter pack.
Definition: record.hpp:149
void clear() noexcept(noexcept(std::apply(expander, std::declval< record & >())))
Clears containers that provide .clear() and (re-)initialises all other elements with = {}.
Definition: record.hpp:237
The specific selected format to read the records from.
Definition: sequence_file/input.hpp:586
void read_sequence_record(std::istream &instream, record_type &record_buffer, std::streampos &position_buffer, sequence_file_input_options_type const &options) override
Reads the next format specific record from the given istream.
Definition: sequence_file/input.hpp:599
selected_sequence_format(selected_sequence_format const &)=default
Default.
selected_sequence_format & operator=(selected_sequence_format &&)=default
Default.
selected_sequence_format(selected_sequence_format &&)=default
Default.
selected_sequence_format & operator=(selected_sequence_format const &)=default
Default.
detail::sequence_file_input_format_exposer< format_t > _format
The selected format stored as a format exposer object.
Definition: sequence_file/input.hpp:616
An abstract base class to store the selected input format.
Definition: sequence_file/input.hpp:544
sequence_format_base & operator=(sequence_format_base &&)=default
Default.
sequence_format_base(sequence_format_base const &)=default
Default.
sequence_format_base & operator=(sequence_format_base const &)=default
Default.
sequence_format_base(sequence_format_base &&)=default
Default.
virtual void read_sequence_record(std::istream &instream, record_type &record_buffer, std::streampos &position_buffer, sequence_file_input_options_type const &options)=0
Reads the next format specific record from the given istream.
A traits type that specifies input as amino acids.
Definition: sequence_file/input.hpp:169
The default traits for seqan3::sequence_file_input.
Definition: sequence_file/input.hpp:133
char id_alphabet
The alphabet for an identifier string is char.
Definition: sequence_file/input.hpp:150
Type that contains multiple types.
Definition: type_list.hpp:29
Provides traits for seqan3::type_list.
T visit(T... args)