18 template <
typename keyType,
typename valueType>
28 virtual bool convert(
char *s, keyType *T, valueType *V) = 0;
30 virtual bool convert(
char *s, keyType *T) = 0;
35 virtual void splitgrp(
const keyType &key, uint32_t &grp, keyType &keyInGroup) = 0;
39 virtual void combgrp(keyType &key, uint32_t &grp, keyType &keyInGroup) = 0;
42 template<
typename keyType,
typename valueType>
46 virtual bool getFileIsSorted() = 0;
47 virtual bool getNext(keyType *T, valueType *V) = 0;
48 virtual void finish() =0;
49 virtual void reset() = 0;
60 template <
typename keyType,
typename valueType>
67 inline bool convert(
char *s, keyType *k, valueType *v) {
76 while (*s ==
'A' || *s ==
'C' || *s ==
'T' || *s ==
'G') {
98 inline bool convert(
char *s, keyType *k) {
102 void splitgrp(
const keyType &key, uint32_t &grp, keyType &keyInGroup) {
104 keyType high = (key >> mvcnt);
108 keyInGroup = (key & (lowmask-1));
111 void combgrp(keyType &key, uint32_t &grp, keyType &keyInGroup) {
123 std::stringstream ss;
124 if (word <= 1024) ss << word;
125 else if (word <= 10240) ss << std::setprecision(2) << word*1.0/1024<<
"K";
126 else if (word <= 1048576) ss << word/1024<<
"K";
127 else if (word <= 10485760) ss << word*1.0/1048576<<
"M";
128 else if (word <= (1048576<<10)) ss << word/1048576<<
"M";
129 else ss << word*1.0/(1<<30) <<
"G";
136 std::vector<std::string>
split(
const char * str,
char deli) {
137 std::istringstream ss(str);
139 std::vector<std::string> ret;
140 while(std::getline(ss, token, deli)) {
142 ret.push_back(token);
148 template <
typename keyType,
typename valueType>
158 if (buf[strlen(buf)-1]==
'\n')
159 buf[strlen(buf)-1] =
'\0';
161 printf(
"OpenFile %s %x\n",fname,f);
169 bool getFileIsSorted() {
175 bool getNext(keyType *T, valueType *V) {
177 if (fgets(buf,
sizeof(buf),f)==NULL)
return false;
182 template <
typename keyType,
typename valueType>
186 bool friend operator <(
const KVpair &a,
const KVpair &b) {
189 } __attribute__((packed));
193 template <
typename keyType,
typename valueType>
197 static const int buflen = 1024;
200 unsigned char buf[1024*64];
207 fIsSorted = _fIsSorted;
210 if (buf[strlen(buf)-1]==
'\n')
211 buf[strlen(buf)-1] =
'\0';
213 printf(
"OpenFile to binary read Kmers %s %x\n",fname,f);
219 printf(
"Do not support reset()\n");
221 bool getFileIsSorted() {
227 bool getNext(keyType *k, valueType *v) {
229 max = fread(buf,kl+vl,buflen,f);
230 if (max == 0)
return false;
235 memcpy( (
void *) k, buf + curr*(kl+vl), kl);
236 memcpy( (
void *) v, buf + curr*(kl+vl)+kl, vl);
244 template <
typename keyType,
typename valueType>
247 static const int buflen = 8192;
250 unsigned char buf[buflen * 2];
253 bool isclosed =
false;
255 static const valueType EMPTYVALUE = ~0;
256 bool valid(uint32_t value) {
257 if (vl == 1)
return value!=0xFF;
258 if (vl == 2)
return value!=0xFFFF;
259 if (vl == 4)
return value!=0xFFFFFFFFUL;
266 if (buf[strlen(buf)-1]==
'\n')
267 buf[strlen(buf)-1] =
'\0';
268 if (isRead = _isRead)
270 else f = fopen(buf,
"wb");
271 memset(buf,0,
sizeof(buf));
272 printf(
"OpenFile to binary read/write Multivalue Kmers file %s %x\n",fname,f);
279 fwrite(buf,
sizeof(buf[0]), curr, f);
288 bool getFileIsSorted() {
295 memmove(buf, buf+curr, max - curr);
299 max += fread(buf+max,1,buflen, f);
302 bool get(
void * mem, uint32_t l) {
303 if (curr + l >= max) getmore();
304 if (curr + l > max)
return false;
305 memcpy(mem,buf+curr,l);
309 bool getNext(keyType *k, valueType *v) {
310 if (!
get(k,kl))
return false;
313 if (!valid(*v))
return true;
335 void add(
void * mem, uint32_t l) {
336 memcpy(buf+curr, mem, l);
338 if (curr >= buflen) {
339 fwrite( buf, 1, buflen, f);
341 memcpy(buf,buf+buflen,curr-buflen);
346 void write(keyType *k ,valueType *v) {
353 void write(keyType *k, std::vector<valueType> &vv) {
354 add ( (
void *) k, kl);
355 void *p; uint8_t a8; uint16_t a16; uint32_t a32;
356 if (vl == 1) p = &a8;
357 if (vl == 2) p = &a16;
358 if (vl == 4) p = &a32;
363 a8 =0xFF; a16 = 0xFFFF; a32 = 0xFFFFFFFF;
370 template <
typename keyType>
373 virtual void finish() =0;
374 virtual bool getNext(keyType *k) =0;
377 template <
typename KVpair>
380 static const int buflen = 16;
384 bool isclosed =
false;
389 if (buf[strlen(buf)-1]==
'\n')
390 buf[strlen(buf)-1] =
'\0';
392 printf(
"OpenFile to read Kmers %s %x\n",fname,f);
394 printf(
"errno %d %s\n",errno,strerror(errno));
405 bool getNext(
KVpair *ret) {
407 max = fread(buff,
sizeof(buff[0]),buflen,f);
408 memset(ret,0xFFFFFFFFUL,
sizeof(
KVpair));
409 if (max == 0)
return false;
412 memcpy(ret, &buff[curr],
sizeof(buff[curr]));
418 template <
typename KVpair>
427 if (buf[strlen(buf)-1]==
'\n')
428 buf[strlen(buf)-1] =
'\0';
430 printf(
"OpenFile to write Kmers %s %x\n",fname,f);
432 memset(buf,0,
sizeof(buf));
434 static const int buflen = 16;
436 memcpy(&buf[curr],p,
sizeof(buf[curr]));
438 if (curr == buflen) {
439 fwrite(buf,
sizeof(buf[0]),buflen,f);
444 fwrite(buf,
sizeof(buf[0]),curr,f);
451 template <
typename keyType>
455 vector<keyType> * vK;
461 keyType k; uint64_t v;
462 vK =
new vector<keyType>();
463 while (reader->getNext(&k, &v)) {
466 sort(vK->begin(),vK->end());
468 if (tmpfilename != NULL) {
469 string binaryfilename (tmpfilename);
483 bool getNext(keyType *k) {
484 if (binaryReader!= NULL)
485 return binaryReader->getNext(k);
487 if (pointer == vK->size()) {
491 *k = (*vK)[pointer++];
497 binaryReader->finish();
502 template <
typename keyType,
typename valueType>
505 vector<compressFileReader <keyType, valueType> *> readerV;
506 vector< vector< int > > NCBI;
512 bool friend operator <(
const KIDpair &a,
const KIDpair &b) {
513 if (a.finished != b.finished)
return (((
int) a.finished) > ((
int) b.finished));
519 for (
auto f: fV) fclose(f);
522 printf(
" Do not support reset() \n");
525 vector<vector<int> > NCBI_local;
526 vector<int> localshift;
527 vector<vector<string> > NCBI_ID;
528 vector<KmerReader<uint64_t> *> readers;
529 vector<MultivalueFileReaderWriter<uint64_t, uint16_t> *> grpreaders;
530 priority_queue<KIDpair> PQ;
531 bool combineMode =
false;
532 uint32_t combineCount;
533 bool getFileIsSorted() {
536 void groupFile(
string fname, vector<string> lf,
string prefix,
string suffix, int32_t idshift,
bool useBinaryKmerFile,uint32_t KmerLength,
const char * tmpfolder) {
537 vector<KmerReader<keyType> *> readers;
538 priority_queue<KIDpair> PQN;
540 string fname = prefix + s + suffix;
541 if (useBinaryKmerFile)
544 string tmpfname(tmpfolder); tmpfname = tmpfname + s +
".bintmp";
548 readers[readers.size()-1]->getNext(&key);
549 KIDpair kid = {key, idshift+readers.size()-1,
false};
556 keyType key = PQN.top().k;
557 uint32_t
id = PQN.top().id;
558 vector<uint16_t> ret;
559 if (PQN.top().finished) {
560 for (
auto r: readers) {
568 while (PQN.top().k == key && !PQN.top().finished) {
569 int tid = PQN.top().id;
572 bool finish = !readers[tid-idshift]->getNext(&nextk);
574 KIDpair kid = {nextk, tid, finish};
577 writer->write(&key, ret);
580 vector< vector<uint16_t> > grpTmpValue;
582 taxoTreeBuilder(
const char * NCBIfname,
const char * fnameprefix,
const char * fnamesuffix,
const char * tmpFileDirectory, uint32_t KmerLength, uint32_t splitbit,
bool useBinaryKmerFile =
true ) {
585 string prefix ( fnameprefix);
586 string suffix (fnamesuffix);
587 fNCBI = fopen(NCBIfname,
"r");
591 fgets(buf, 4096, fNCBI);
592 vector<string> vv =
split(buf,
'\t');
593 levelcount = vv.size()/3;
596 NCBI_local.resize(levelcount);
597 NCBI_ID.resize(levelcount);
599 vector<string> fnames;
601 if (fgets(buf, 4096, fNCBI) == NULL)
break;
602 vector<string> vv =
split(buf,
'\t');
603 if (vv.size()<2)
break;
604 for (
int i = 0 ; i*3 < vv.size(); i++) {
605 int localID = atoi(vv[i*3].c_str());
606 NCBI_local[i].push_back(localID);
607 NCBI_ID[i].push_back(vv[i*3+1]);
609 fnames.push_back(vv[1]);
612 localshift.push_back(1);
613 for (
int i = 0; i < levelcount; i++)
614 localshift.push_back(localshift[i] + *max_element(NCBI_local[i].begin(), NCBI_local[i].end())+1);
617 combineMode = (fnames.size()>nn);
620 int combineCount = 0;
621 vector<string> * fnamesInThisgrp ;
622 vector<string> grpfnames;
623 while (curr < fnames.size()) {
624 if (curr + nn < fnames.size())
625 fnamesInThisgrp =
new vector<string> (fnames.begin()+curr, fnames.begin()+curr+nn);
627 fnamesInThisgrp =
new vector<string> (fnames.begin()+curr, fnames.end());
629 string tmpFolder(tmpFileDirectory);
631 ss<<tmpFolder<<
"TMP"<<grpfnames.size();
634 grpfnames.push_back(fnamegrp);
635 printf(
"merge kmer files %d %d to grp %s\n", curr, curr+fnamesInThisgrp->size()-1, fnamegrp.c_str());
636 groupFile(fnamegrp, *fnamesInThisgrp, prefix, suffix, curr, useBinaryKmerFile,KmerLength,tmpFileDirectory);
637 curr += fnamesInThisgrp->size();
638 delete fnamesInThisgrp;
640 combineCount = grpfnames.size();
641 for (
string v: grpfnames) {
644 uint16_t valuebuf[1024];
645 grpreaders[grpreaders.size()-1]->getNext(&key, valuebuf);
646 vector<uint16_t> Vvaluebuf;
647 for (
int i = 0 ; grpreaders[0]->valid(valuebuf[i]); i++)
648 Vvaluebuf.push_back(valuebuf[i]);
649 grpTmpValue.push_back(Vvaluebuf);
650 KIDpair kid = {key, grpreaders.size()-1,
false};
655 for (
int i = 0 ; i < NCBI_ID.size(); i++) {
656 string fname = prefix + fnames[i] + suffix;
657 if (useBinaryKmerFile)
660 string tmpfname(tmpFileDirectory); tmpfname = tmpfname + fnames[i] +
".bintmp";
664 readers[readers.size()-1]->getNext(&key);
665 KIDpair kid = {key, readers.size()-1,
false};
669 string IDLfname(tmpFileDirectory); IDLfname+=
"IDList.txt";
670 FILE * IDLf; IDLf = fopen(IDLfname.c_str(),
"w");
671 for (
int t : localshift) {
672 fprintf(IDLf,
"%d\n",t);
678 for (
int i = 0 ; i < grpreaders.size(); i++)
679 delete grpreaders[i];
682 for (
int i = 0 ; i < readers.size(); i++)
686 bool getNext( keyType *k, valueType *v) {
688 keyType key = PQ.top().k;
690 if (PQ.top().finished) {
695 while (PQ.top().k == key && !PQ.top().finished) {
701 ret.insert(ret.end(),grpTmpValue[tid].begin(),grpTmpValue[tid].end());
702 int ll = grpTmpValue[tid].size();
706 uint16_t valuebuf[1024];
707 finish = !grpreaders[tid]->getNext(&nextk, valuebuf);
708 grpTmpValue[tid].clear();
709 for (
int i = 0; grpreaders[tid]->valid(valuebuf[i]); i++)
710 grpTmpValue[tid].push_back(valuebuf[i]);
716 finish = !readers[tid]->getNext(&nextk);
719 KIDpair kid = {nextk, tid, finish};
724 for (
int i = 0; i< levelcount; i++) {
726 for (
int j = 0; j < ret.size() && flag; j++)
727 flag = (NCBI_local[i][ret[j]]==NCBI_local[i][ret[0]]);
729 *v = localshift[i] + NCBI_local[i][ret[0]];
733 *v = localshift[levelcount];
Definition: io_helper.h:371
virtual void splitgrp(const keyType &key, uint32_t &grp, keyType &keyInGroup)=0
split a keyType value into two parts, groupID and keyInGroup, by the highest splitbit bits...
virtual void combgrp(keyType &key, uint32_t &grp, keyType &keyInGroup)=0
combine groupID/keyInGroup into the original key
std::vector< std::string > split(const char *str, char deli)
split a C-style string by a delimiter character.
Definition: io_helper.h:136
Definition: io_helper.h:149
read kmers from an unsorted text file and sort them.
Definition: io_helper.h:452
interface for converting a key from its raw format to a keyType. Split key into groups...
Definition: io_helper.h:19
Definition: io_helper.h:503
void combgrp(keyType &key, uint32_t &grp, keyType &keyInGroup)
combine groupID/keyInGroup into the original key
Definition: io_helper.h:111
bool convert(char *s, keyType *k, valueType *v)
convert an input-style line to a key/value pair.
Definition: io_helper.h:67
Definition: io_helper.h:183
void splitgrp(const keyType &key, uint32_t &grp, keyType &keyInGroup)
split a keyType value into two parts, groupID and keyInGroup, by the highest splitbit bits...
Definition: io_helper.h:102
Definition: io_helper.h:378
bool convert(char *s, keyType *k)
skip the value.
Definition: io_helper.h:98
Definition: io_helper.h:43
std::string human(uint64_t word)
convert a 64-bit integer to a human-readable format in K/M/G, e.g., 102400 is converted to "100K"...
Definition: io_helper.h:122
uint8_t splitbit
group the keys according to the highest bits.
Definition: io_helper.h:64
Definition: io_helper.h:194
IOHelper for Constant-Length Kmers.
Definition: io_helper.h:61
uint8_t kmerlength
Assume all kmers are of the same length.
Definition: io_helper.h:63
Definition: io_helper.h:419
Definition: io_helper.h:245
virtual bool convert(char *s, keyType *T, valueType *V)=0
convert an input-style line to a key/value pair.