69 template <
typename stream_type,
70 typename seq_legal_alph_type,
71 typename ref_seqs_type,
72 typename ref_ids_type,
73 typename stream_pos_type,
77 typename ref_seq_type,
79 typename ref_offset_type,
85 typename tag_dict_type,
86 typename e_value_type,
87 typename bit_score_type>
90 ref_seqs_type & ref_seqs,
92 stream_pos_type & position_buffer,
97 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
99 ref_offset_type & ref_offset,
100 cigar_type & cigar_vector,
104 tag_dict_type & tag_dict,
105 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
106 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
108 template <
typename stream_type,
109 typename header_type,
112 typename ref_seq_type,
113 typename ref_id_type,
117 typename tag_dict_type>
120 [[maybe_unused]] header_type && header,
121 [[maybe_unused]] seq_type && seq,
122 [[maybe_unused]] qual_type && qual,
123 [[maybe_unused]] id_type &&
id,
124 [[maybe_unused]] int32_t
const offset,
125 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
126 [[maybe_unused]] ref_id_type && ref_id,
128 [[maybe_unused]] cigar_type && cigar_vector,
129 [[maybe_unused]]
sam_flag const flag,
130 [[maybe_unused]] uint8_t
const mapq,
131 [[maybe_unused]] mate_type && mate,
132 [[maybe_unused]] tag_dict_type && tag_dict,
133 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(e_value),
134 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(bit_score));
170 ret[
static_cast<index_t
>(
'I')] = 1;
171 ret[
static_cast<index_t
>(
'D')] = 2;
172 ret[
static_cast<index_t
>(
'N')] = 3;
173 ret[
static_cast<index_t
>(
'S')] = 4;
174 ret[
static_cast<index_t
>(
'H')] = 5;
175 ret[
static_cast<index_t
>(
'P')] = 6;
176 ret[
static_cast<index_t
>(
'=')] = 7;
177 ret[
static_cast<index_t
>(
'X')] = 8;
185 static uint16_t
reg2bin(int32_t beg, int32_t end)
noexcept
188 if (beg >> 14 == end >> 14)
189 return ((1 << 15) - 1) / 7 + (beg >> 14);
190 if (beg >> 17 == end >> 17)
191 return ((1 << 12) - 1) / 7 + (beg >> 17);
192 if (beg >> 20 == end >> 20)
193 return ((1 << 9) - 1) / 7 + (beg >> 20);
194 if (beg >> 23 == end >> 23)
195 return ((1 << 6) - 1) / 7 + (beg >> 23);
196 if (beg >> 26 == end >> 26)
197 return ((1 << 3) - 1) / 7 + (beg >> 26);
207 template <
typename stream_view_type, std::
integral number_type>
218 template <
typename stream_view_type>
224 template <
typename stream_view_type,
typename value_type>
226 stream_view_type && stream_view,
227 value_type
const & SEQAN3_DOXYGEN_ONLY(value));
229 template <
typename stream_view_type>
232 template <
typename cigar_input_type>
239template <
typename stream_type,
240 typename seq_legal_alph_type,
241 typename ref_seqs_type,
242 typename ref_ids_type,
243 typename stream_pos_type,
246 typename offset_type,
247 typename ref_seq_type,
248 typename ref_id_type,
249 typename ref_offset_type,
255 typename tag_dict_type,
256 typename e_value_type,
257 typename bit_score_type>
261 ref_seqs_type & ref_seqs,
263 stream_pos_type & position_buffer,
267 offset_type & offset,
268 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
269 ref_id_type & ref_id,
270 ref_offset_type & ref_offset,
271 cigar_type & cigar_vector,
275 tag_dict_type & tag_dict,
276 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
277 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
279 static_assert(detail::decays_to_ignore_v<ref_offset_type>
280 || detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
281 "The ref_offset must be a specialisation of std::optional.");
283 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
284 "The type of field::mapq must be uint8_t.");
286 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
287 "The type of field::flag must be seqan3::sam_flag.");
291 [[maybe_unused]] int32_t offset_tmp{};
292 [[maybe_unused]] int32_t ref_length{};
314 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
326 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>)
331 auto & reference_ids = header.
ref_ids();
337 header.
ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
348 +
"' found in BAM file header (header.ref_ids():",
352 else if (id_it->second != ref_idx)
358 " does not correspond to the position ",
360 " in the header (header.ref_ids():",
364 else if (std::get<0>(header.
ref_id_info[id_it->second]) != l_ref)
366 throw format_error{
"Provided reference has unequal length as specified in the header."};
378 position_buffer = stream.tellg();
382 if (core.
refID >=
static_cast<int32_t
>(header.
ref_ids().size()) || core.
refID < -1)
386 "' is not in range of ",
387 "header.ref_ids(), which has size ",
391 else if (core.
refID > -1)
402 if constexpr (!detail::decays_to_ignore_v<mate_type>)
416 if constexpr (!detail::decays_to_ignore_v<id_type>)
424 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
426 int32_t seq_length{};
428 int32_t soft_clipping_end{};
450 if constexpr (detail::decays_to_ignore_v<seq_type>)
452 auto skip_sequence_bytes = [&]()
459 skip_sequence_bytes();
463 using alph_t = std::ranges::range_value_t<
decltype(
seq)>;
464 constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
466 for (
auto [d1, d2] : seq_stream)
488 return static_cast<char>(chr + 33);
490 if constexpr (!detail::decays_to_ignore_v<qual_type>)
499 assert(remaining_bytes >= 0);
502 while (tags_view.size() > 0)
504 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
512 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
517 if (core.
l_seq != 0 && offset_tmp == core.
l_seq)
519 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
526 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
527 "stored in the optional field CG. You need to read in the field::tags and "
528 "field::seq in order to access this information.")};
532 auto it = tag_dict.
find(
"CG"_tag);
534 if (it == tag_dict.end())
536 "The cigar string '",
540 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
541 "stored in the optional field CG but this tag is not present in the given ",
545 int32_t seq_length{};
548 int32_t soft_clipping_end{};
557template <
typename stream_type,
558 typename header_type,
561 typename ref_seq_type,
562 typename ref_id_type,
566 typename tag_dict_type>
569 [[maybe_unused]] header_type && header,
570 [[maybe_unused]] seq_type && seq,
571 [[maybe_unused]] qual_type && qual,
572 [[maybe_unused]] id_type &&
id,
573 [[maybe_unused]] int32_t
const offset,
574 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
575 [[maybe_unused]] ref_id_type && ref_id,
577 [[maybe_unused]] cigar_type && cigar_vector,
578 [[maybe_unused]]
sam_flag const flag,
579 [[maybe_unused]] uint8_t
const mapq,
580 [[maybe_unused]] mate_type && mate,
581 [[maybe_unused]] tag_dict_type && tag_dict,
582 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(e_value),
583 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(bit_score))
589 "The seq object must be a std::ranges::forward_range over "
590 "letters that model seqan3::alphabet.");
593 "The id object must be a std::ranges::forward_range over "
594 "letters that model seqan3::alphabet.");
597 "The ref_seq object must be a std::ranges::forward_range "
598 "over letters that model seqan3::alphabet.");
600 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
602 static_assert((std::ranges::forward_range<ref_id_type> || std::integral<std::remove_reference_t<ref_id_type>>
603 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>,
std::optional>),
604 "The ref_id object must be a std::ranges::forward_range "
605 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
609 "The qual object must be a std::ranges::forward_range "
610 "over letters that model seqan3::alphabet.");
613 "The mate object must be a std::tuple of size 3 with "
614 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
615 "2) a std::integral or std::optional<std::integral>, and "
616 "3) a std::integral.");
619 ((std::ranges::forward_range<decltype(std::get<0>(
mate))>
621 || detail::is_type_specialisation_of_v<
623 std::optional>)&&(std::integral<std::remove_cvref_t<decltype(std::get<1>(
mate))>>
624 || detail::is_type_specialisation_of_v<
626 std::optional>)&&std::integral<std::remove_cvref_t<decltype(std::get<2>(
mate))>>),
627 "The mate object must be a std::tuple of size 3 with "
628 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
629 "2) a std::integral or std::optional<std::integral>, and "
630 "3) a std::integral.");
633 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
635 if constexpr (detail::decays_to_ignore_v<header_type>)
637 throw format_error{
"BAM can only be written with a header but you did not provide enough information! "
638 "You can either construct the output file with ref_ids and ref_seqs information and "
639 "the header will be created for you, or you can access the `header` member directly."};
660 int32_t l_text{
static_cast<int32_t
>(os.
str().size())};
665 int32_t n_ref{
static_cast<int32_t
>(header.
ref_ids().size())};
668 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
670 int32_t l_name{
static_cast<int32_t
>(header.
ref_ids()[ridx].size()) + 1};
685 int32_t ref_length{};
688 if (!std::ranges::empty(cigar_vector))
690 int32_t dummy_seq_length{};
691 for (
auto & [
count, operation] : cigar_vector)
695 if (cigar_vector.size() >= (1 << 16))
698 cigar_vector.resize(2);
699 cigar_vector[0] =
cigar{
static_cast<uint32_t
>(std::ranges::distance(
seq)),
'S'_cigar_operation};
700 cigar_vector[1] =
cigar{
static_cast<uint32_t
>(ref_length),
'N'_cigar_operation};
710 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(
id), 254) + 1;
711 read_name_size +=
static_cast<uint8_t
>(read_name_size == 1);
719 static_cast<uint16_t
>(cigar_vector.size()),
721 static_cast<int32_t
>(std::ranges::distance(
seq)),
723 get<1>(
mate).value_or(-1),
726 auto check_and_assign_id_to = [&header]([[maybe_unused]]
auto & id_source, [[maybe_unused]]
auto & id_target)
730 if constexpr (!detail::decays_to_ignore_v<id_t>)
732 if constexpr (std::integral<id_t>)
734 id_target = id_source;
736 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
738 id_target = id_source.value_or(-1);
742 if (!std::ranges::empty(id_source))
746 if constexpr (std::ranges::contiguous_range<
decltype(id_source)>
747 && std::ranges::sized_range<
decltype(id_source)>
748 && std::ranges::borrowed_range<
decltype(id_source)>)
759 "The ref_id type is not convertible to the reference id information stored in the "
760 "reference dictionary of the header object.");
762 id_it = header.
ref_dict.find(id_source);
770 "not be found in BAM header ref_dict: ",
775 id_target = id_it->second;
782 check_and_assign_id_to(
ref_id, core.refID);
785 check_and_assign_id_to(get<0>(
mate), core.next_refID);
788 core.block_size =
sizeof(core) - 4 + core.l_read_name + core.n_cigar_op * 4
790 (core.l_seq + 1) / 2 +
792 tag_dict_binary_str.
size();
796 if (std::ranges::empty(
id))
803 for (
auto [cigar_count, op] : cigar_vector)
805 cigar_count = cigar_count << 4;
811 using alph_t = std::ranges::range_value_t<seq_type>;
812 constexpr auto to_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
815 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
820 stream_it =
static_cast<char>(compressed_chr);
824 stream_it =
static_cast<char>(
to_rank(to_dna16[
to_rank(*sit)]) << 4);
827 if (std::ranges::empty(
qual))
834 if (std::ranges::distance(
qual) != core.l_seq)
837 ". Got quality with size ",
838 std::ranges::distance(
qual),
845 return static_cast<char>(
to_rank(chr));
851 stream << tag_dict_binary_str;
856template <
typename stream_view_type,
typename value_type>
858 stream_view_type && stream_view,
859 value_type
const & SEQAN3_DOXYGEN_ONLY(value))
869 if constexpr (std::integral<value_type>)
873 else if constexpr (std::same_as<value_type, float>)
879 constexpr bool always_false = std::is_same_v<value_type, void>;
880 static_assert(always_false,
"format_bam::read_sam_dict_vector: unsupported value_type");
885 variant = std::move(tmp_vector);
905template <
typename stream_view_type>
915 uint16_t tag =
static_cast<uint16_t
>(*it) << 8;
918 tag +=
static_cast<uint16_t
>(*it);
937 target[tag] =
static_cast<int32_t
>(tmp);
944 target[tag] =
static_cast<int32_t
>(tmp);
951 target[tag] =
static_cast<int32_t
>(tmp);
958 target[tag] =
static_cast<int32_t
>(tmp);
965 target[tag] = std::move(tmp);
972 target[tag] =
static_cast<int32_t
>(tmp);
985 while (!is_char<'\0'>(*it))
998 while (!is_char<'\0'>(*it))
1005 throw format_error{
"Hexadecimal tag has an uneven number of digits!"};
1013 target[tag] = byte_array;
1018 char array_value_type_id = *it;
1021 switch (array_value_type_id)
1046 "must be one of [cCsSiIf] but '",
1047 array_value_type_id,
1054 "SAM tag must be one of [A,i,Z,H,B,f] but '",
1074template <
typename cigar_input_type>
1078 char operation{
'\0'};
1080 int32_t ref_length{}, seq_length{};
1081 uint32_t operation_and_count{};
1082 constexpr char const * cigar_mapping =
"MIDNSHP=X*******";
1083 constexpr uint32_t cigar_mask = 0x0f;
1085 if (n_cigar_op == 0)
1086 return std::tuple{operations, ref_length, seq_length};
1090 while (n_cigar_op > 0)
1093 sizeof(operation_and_count),
1094 reinterpret_cast<char *
>(&operation_and_count));
1095 operation = cigar_mapping[operation_and_count & cigar_mask];
1096 count = operation_and_count >> 4;
1103 return std::tuple{operations, ref_length, seq_length};
1113 auto stream_variant_fn = [&result](
auto && arg)
1118 if constexpr (std::same_as<T, int32_t>)
1121 size_t const absolute_arg = std::abs(arg);
1123 bool const negative = arg < 0;
1124 n = n * n + 2 * negative;
1130 result[result.size() - 1] =
'C';
1131 result.append(
reinterpret_cast<char const *
>(&arg), 1);
1136 result[result.size() - 1] =
'S';
1137 result.append(
reinterpret_cast<char const *
>(&arg), 2);
1142 result[result.size() - 1] =
'c';
1143 int8_t tmp =
static_cast<int8_t
>(arg);
1144 result.append(
reinterpret_cast<char const *
>(&tmp), 1);
1149 result[result.size() - 1] =
's';
1150 int16_t tmp =
static_cast<int16_t
>(arg);
1151 result.append(
reinterpret_cast<char const *
>(&tmp), 2);
1156 result.append(
reinterpret_cast<char const *
>(&arg), 4);
1161 else if constexpr (std::same_as<T, std::string>)
1163 result.append(
reinterpret_cast<char const *
>(arg.data()), arg.size() + 1 );
1165 else if constexpr (!std::ranges::range<T>)
1167 result.append(
reinterpret_cast<char const *
>(&arg),
sizeof(arg));
1171 int32_t sz{
static_cast<int32_t
>(arg.size())};
1172 result.append(
reinterpret_cast<char *
>(&sz), 4);
1173 result.append(
reinterpret_cast<char const *
>(arg.data()),
1174 arg.size() *
sizeof(std::ranges::range_value_t<T>));
1178 for (
auto & [tag, variant] : tag_dict)
1180 result.push_back(
static_cast<char>(tag / 256));
1181 result.push_back(
static_cast<char>(tag % 256));
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:163
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:187
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: alphabet/cigar/cigar.hpp:60
Functionally the same as std::ostreambuf_iterator, but offers writing a range more efficiently.
Definition: fast_ostreambuf_iterator.hpp:40
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=')....
Definition: dna16sam.hpp:48
The actual implementation of seqan3::cigar::operation for documentation purposes only....
Definition: cigar_operation.hpp:48
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:343
Provides seqan3::dna16sam.
T emplace_back(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: alphabet/concept.hpp:155
constexpr void consume(rng_t &&rng)
Iterate over a range (consumes single-pass input ranges).
Definition: core/range/detail/misc.hpp:28
constexpr auto all
Returns a view that includes all elements of the range argument.
Definition: all_view.hpp:204
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
constexpr std::tuple< std::vector< cigar >, int32_t, int32_t > parse_cigar(cigar_input_type &&cigar_input)
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: io/sam_file/detail/cigar.hpp:94
std::string get_cigar_string(std::vector< cigar > const &cigar_vector)
Transforms a vector of cigar elements into a string representation.
Definition: io/sam_file/detail/cigar.hpp:128
constexpr char sam_tag_type_char_extra[12]
Each type's SAM tag type extra char id. Index corresponds to the seqan3::detail::sam_tag_variant types...
Definition: sam_tag_dictionary.hpp:45
void update_alignment_lengths(int32_t &ref_length, int32_t &seq_length, char const cigar_operation, uint32_t const cigar_count)
Updates the sequence lengths by cigar_count depending on the cigar operation op.
Definition: io/sam_file/detail/cigar.hpp:51
constexpr char sam_tag_type_char[12]
Each SAM tag type char identifier. Index corresponds to the seqan3::detail::sam_tag_variant types.
Definition: sam_tag_dictionary.hpp:42
constexpr auto take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly_view.hpp:590
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf_view.hpp:107
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: type_list/traits.hpp:470
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: type_pack/traits.hpp:164
constexpr size_t size
The size of a type pack.
Definition: type_pack/traits.hpp:146
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
The generic alphabet concept that covers most data types used in ranges.
Checks whether `from` can be implicitly converted to `to`.
Whether a type behaves like a tuple.
Auxiliary functions for the SAM IO.
Provides seqan3::detail::istreambuf.
std::string to_string(value_type &&... values)
Streams all parameters via the seqan3::debug_stream and returns a concatenated string.
Definition: to_string.hpp:29
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides seqan3::debug_stream and related types.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides seqan3::views::slice.
The options type defines various option members that influence the behavior of all or some formats.
Definition: sam_file/output_options.hpp:26
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.