1 Star 0 Fork 0

Velcon-Zheng/bowtie

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
sequence_io.h 5.27 KB
一键复制 编辑 原始数据 按行查看 历史
langmead 提交于 2009-09-25 14:09 . *** empty log message ***
#ifndef SEQUENCE_IO_H_
#define SEQUENCE_IO_H_
#include <string>
#include <vector>
#include <stdexcept>
#include <fstream>
#include <stdio.h>
#include <seqan/sequence.h>
#include <seqan/file.h>
#include "assert_helpers.h"
#include "pat.h"
/**
 * Read a sequence file of the given format (TFile) into strings of the
 * given type (TStr), appending each extracted sequence to ss.
 *
 * Note that SeqAn's policy for when it encounters characters not from
 * the specified alphabet is to convert them to the lexicographically
 * smallest character in the alphabet.
 *
 * @param infile     path of the sequence file to open
 * @param ss         vector that receives the sequences (appended to)
 * @param baseCutoff in/out budget of bases still allowed.  It is reduced
 *                   by the length of every sequence stored; a sequence
 *                   that would exceed it is truncated to fit and the
 *                   budget becomes 0.  Nothing is read if it is <= 0 on
 *                   entry, and reading stops as soon as it reaches 0.
 * @param seqCutoff  maximum number of sequences to take from this file,
 *                   or -1 for no limit
 * @param reverse    if true, each stored sequence is reversed in place
 *                   (after any truncation)
 *
 * Throws the int 1 if the file cannot be opened or the input buffer
 * cannot be installed.
 */
template <typename TStr, typename TFile>
static void readSequenceFile(const std::string& infile,
	std::vector<TStr>& ss,
	int64_t& baseCutoff, // limit for total bases
	int seqCutoff = -1, // limit for sequences
	bool reverse = false)
{
	typedef typename Value<TStr>::Type TVal;
	// Fairly large input buffer.  It is static, so it is shared by all
	// calls of a given template instantiation.
	static char buf[256 * 1024];
	if(baseCutoff <= 0) return;
	FILE *in = fopen(infile.c_str(), "r");
	if(in == NULL) {
		cerr << "Could not open sequence file" << endl;
		throw 1;
	}
	// Close the file on every way out of this function, including the
	// throw below and any exception raised while reading.
	struct FileCloser {
		FILE *fp;
		explicit FileCloser(FILE *f) : fp(f) { }
		~FileCloser() { fclose(fp); }
	} closer(in);
	// Associate large input buffer with FILE *in
	if(setvbuf(in, buf, _IOFBF, 256 * 1024) != 0) {
		cerr << "Could not create input buffer for sequence file" << endl;
		throw 1;
	}
	// Read entries using SeqAn
	int cnt = 0;
	while(!feof(in)) {
		while(true) {
			ss.push_back(TStr()); // add a new empty string to the end
			// Fill the new empty string with the next sequence from
			// the file.  SeqAn allocates just enough mem for it (at
			// the expense of lots of file seeks, which can add up)
			seqan::read(in, ss.back(), TFile());
			if(seqan::empty(ss.back())) {
				// Nothing was read; drop the placeholder and let the
				// outer loop re-check for end-of-file
				ss.pop_back();
				break;
			}
			// Enforce the base cutoff
			if((int64_t)length(ss.back()) > baseCutoff) {
				resize(ss.back(), baseCutoff);
				baseCutoff = 0;
			} else {
				baseCutoff -= length(ss.back());
			}
			// Reverse the newly-read sequence in-place if desired
			if(reverse) {
				size_t len = length(ss.back());
				for(size_t i = 0; i < len/2; i++) {
					TVal t = ss.back()[i];
					ss.back()[i] = ss.back()[len-i-1];
					ss.back()[len-i-1] = t;
				}
			}
#ifndef NDEBUG
			// Sanity check that all (int) values are in range
			for(size_t i = 0; i < length(ss.back()); i++) {
				assert_lt(ss.back()[i], (int)(ValueSize<TVal>::VALUE));
				assert_geq(ss.back()[i], 0);
			}
#endif
			cnt++;
			// Enforce the sequence cutoff
			if(seqCutoff != -1 && cnt >= seqCutoff) {
				return;
			}
			// Stop once the base budget is used up; otherwise every
			// remaining record would be truncated to length 0 and
			// stored as an empty sequence.
			if(baseCutoff <= 0) {
				return;
			}
		}
	}
}
/**
 * Read several sequence files of the given format and alphabet type,
 * in the order listed, appending all extracted sequences to ss.
 *
 * baseCutoff is a running budget shared by all the files: it is passed
 * by reference to readSequenceFile for each file, and no further file
 * is opened once it has dropped to 0 or below.  seqCutoff and reverse
 * are forwarded unchanged to every per-file call.
 */
template <typename TStr, typename TFile>
static void readSequenceFiles(const std::vector<std::string>& infiles,
	std::vector<TStr>& ss,
	int64_t& baseCutoff,
	int seqCutoff = -1,
	bool reverse = false)
{
	typedef std::vector<std::string>::const_iterator TNameIter;
	for(TNameIter name = infiles.begin(); name != infiles.end(); ++name) {
		// Base budget spent; skip the remaining files
		if(baseCutoff <= 0) return;
		readSequenceFile<TStr,TFile>(*name, ss, baseCutoff, seqCutoff, reverse);
	}
}
/**
 * Convenience overload of readSequenceFiles for callers that have no
 * base cutoff of their own.  A local budget of 0xffffffff bases is
 * used instead; the amount left over is discarded.
 */
template <typename TStr, typename TFile>
static void readSequenceFiles(const std::vector<std::string>& infiles,
	std::vector<TStr>& ss,
	int seqCutoff = -1,
	bool reverse = false)
{
	// The cutoff is taken by reference, so it needs a named local
	int64_t defaultBaseBudget = 0xffffffffll;
	readSequenceFiles<TStr,TFile>(
		infiles, ss, defaultBaseBudget, seqCutoff, reverse);
}
/**
 * Split s on commas and append each non-empty field to ss as a T
 * (built from the field's C string).  Runs of commas are skipped, so
 * empty fields are never produced.
 *
 * baseCutoff is an in/out budget of characters: each field's length is
 * subtracted from it, a field that would use up or exceed the budget
 * is truncated to what is left, and parsing stops once the budget is
 * 0.  If reverse is set, each field is reversed after truncation.  If
 * seqCutoff is not -1, parsing stops as soon as ss holds at least
 * seqCutoff elements (elements already in ss count too).
 */
template <typename T>
void readSequenceString(const std::string& s,
	std::vector<T>& ss,
	int64_t& baseCutoff,
	int seqCutoff = -1,
	bool reverse = false)
{
	const std::string::size_type none = std::string::npos;
	// fieldBegin: first character of the current field
	// fieldEnd:   the comma ending it, or npos for the last field
	std::string::size_type fieldBegin = s.find_first_not_of(",", 0);
	std::string::size_type fieldEnd = s.find_first_of(",", fieldBegin);
	for(; baseCutoff > 0 && (fieldEnd != none || fieldBegin != none);
	    fieldBegin = s.find_first_not_of(",", fieldEnd),
	    fieldEnd = s.find_first_of(",", fieldBegin))
	{
		std::string field = s.substr(fieldBegin, fieldEnd - fieldBegin);
		// Charge the field against the base budget, trimming it if
		// it does not fit
		if((int64_t)field.length() >= baseCutoff) {
			field.resize(baseCutoff);
			baseCutoff = 0;
		} else {
			baseCutoff -= field.length();
		}
		if(reverse) {
			std::string flipped(field.rbegin(), field.rend());
			field.swap(flipped);
		}
		ss.push_back(T(field.c_str()));
		// Enforce the sequence cutoff
		if(seqCutoff != -1 && ss.size() >= static_cast<size_t>(seqCutoff)) {
			return;
		}
	}
}
/**
 * Parse a comma-delimited list of strings of type T into a vector.
 * Convenience overload that doesn't require the caller to supply a
 * baseCutoff: a local budget of 0xffffffff characters is used instead
 * and the amount left over is discarded.
 */
template <typename T>
void readSequenceString(const std::string& s,
	std::vector<T>& ss,
	int seqCutoff = -1,
	bool reverse = false)
{
	// The cutoff is taken by reference, so it needs a named local
	int64_t defaultBaseBudget = 0xffffffffll;
	readSequenceString(s, ss, defaultBaseBudget, seqCutoff, reverse);
}
#endif /*SEQUENCE_IO_H_*/
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/Velcon-Zheng/bowtie.git
git@gitee.com:Velcon-Zheng/bowtie.git
Velcon-Zheng
bowtie
bowtie
master

搜索帮助