SNAP Library 6.0, Developer Reference  2020-12-09 16:24:20
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
ssmp.cpp
Go to the documentation of this file.
1 //#//////////////////////////////////////////////
2 
3 TSsParserMP::TSsParserMP(const TStr& FNm, const char& Separator, const bool& _SkipLeadBlanks, const bool& _SkipCmt, const bool& _SkipEmptyFld) : SsFmt(ssfSpaceSep),
4  SkipLeadBlanks(_SkipLeadBlanks), SkipCmt(_SkipCmt), SkipEmptyFld(_SkipEmptyFld), LineCnt(0), /*Bf(NULL),*/ SplitCh('\t'), LineStr(), FldV(), FInPt(NULL) {
5  FInPt = TMIn::New(FNm, true);
6  SplitCh = Separator;
7 }
8 
10 }
11 
14 }
15 
16 // Gets and parses the next line, quick version, works with buffers, not chars.
17 bool TSsParserMP::Next() { // split on SplitCh
18  FldV.Clr(false);
19  LineStr.Clr();
20  FldV.Clr();
21  LineCnt++;
22  if (! FInPt->GetNextLnBf(LineStr)) { return false; }
23  if (SkipCmt && !LineStr.Empty() && LineStr[0]=='#') { return Next(); }
24 
25  char* cur = LineStr.CStr();
26  if (SkipLeadBlanks) { // skip leading blanks
27  while (*cur && TCh::IsWs(*cur)) { cur++; }
28  }
29  char *last = cur;
30  while (*cur) {
31  if (SsFmt == ssfWhiteSep) { while (*cur && ! TCh::IsWs(*cur)) { cur++; } }
32  else { while (*cur && *cur!=SplitCh) { cur++; } }
33  if (*cur == 0) { break; }
34  *cur = 0; cur++;
35  FldV.Add(last); last = cur;
36  if (SkipEmptyFld && strlen(FldV.Last())==0) { FldV.DelLast(); } // skip empty fields
37  }
38  FldV.Add(last); // add last field
39  if (SkipEmptyFld && FldV.Empty()) { return Next(); } // skip empty lines
40  return true;
41 }
42 
44  for (int f = 0; f < FldV.Len(); f++) {
45  for (char *c = FldV[f]; *c; c++) {
46  *c = tolower(*c); }
47  }
48 }
49 
50 bool TSsParserMP::GetInt(const int& FldN, int& Val) const {
51  // parsing format {ws} [+/-] +{ddd}
52  int _Val = -1;
53  bool Minus=false;
54  const char *c = GetFld(FldN);
55  while (TCh::IsWs(*c)) { c++; }
56  if (*c=='-') { Minus=true; c++; }
57  if (! TCh::IsNum(*c)) { return false; }
58  _Val = TCh::GetNum(*c); c++;
59  while (TCh::IsNum(*c)){
60  _Val = 10 * _Val + TCh::GetNum(*c);
61  c++;
62  }
63  if (Minus) { _Val = -_Val; }
64  if (*c != 0) { return false; }
65  Val = _Val;
66  return true;
67 }
68 
69 bool TSsParserMP::GetFlt(const int& FldN, double& Val) const {
70  // parsing format {ws} [+/-] +{d} ([.]{d}) ([E|e] [+/-] +{d})
71  const char *c = GetFld(FldN);
72  while (TCh::IsWs(*c)) { c++; }
73  if (*c=='+' || *c=='-') { c++; }
74  if (! TCh::IsNum(*c) && *c!='.') { return false; }
75  while (TCh::IsNum(*c)) { c++; }
76  if (*c == '.') {
77  c++;
78  while (TCh::IsNum(*c)) { c++; }
79  }
80  if (*c=='e' || *c == 'E') {
81  c++;
82  if (*c == '+' || *c == '-' ) { c++; }
83  if (! TCh::IsNum(*c)) { return false; }
84  while (TCh::IsNum(*c)) { c++; }
85  }
86  if (*c != 0) { return false; }
87  Val = atof(GetFld(FldN));
88  return true;
89 }
90 
91 const char* TSsParserMP::DumpStr() const {
92  static TChA ChA(10*1024);
93  ChA.Clr();
94  for (int i = 0; i < FldV.Len(); i++) {
95  ChA += TStr::Fmt(" %d: '%s'\n", i, FldV[i]);
96  }
97  return ChA.CStr();
98 }
99 
100 // Finds number of new line chars in interval [lb, ub)
101 // Assumes that lines end in '\n'
103  return FInPt->CountNewLinesInRange(Lb, Ub);
104 }
105 
107  TVec<uint64> Ret;
108  if (Lb >= GetStreamLen()) {
109  return Ret;
110  }
111  while (Lb < Ub) {
112  // Find line corresponding to Lb
113  uint64 StartPos = FInPt->GetLineStartPos(Lb);
114  uint64 EndPos = FInPt->GetLineEndPos(Lb);
115 
116  // If line ends in given range, add to count
117  if (Lb <= EndPos && EndPos < Ub) {
118  Ret.Add(StartPos);
119  }
120  // Start at next line
121  Lb = EndPos + 1;
122  }
123  return Ret;
124 }
125 
126 // Essesntially the same as TssParser::Next
127 // For parallel load, FldV cannot be shared across many threads
129 {
130  // split on SplitCh
131  FieldsV.Clr();
132 
133  char* cur = FInPt->GetLine(Index);
134 
135  if (SkipLeadBlanks) { // skip leading blanks
136  while (*cur && TCh::IsWs(*cur)) { cur++; }
137  }
138  char *last = cur;
139  while (*cur != 0 && *cur != '\n') {
140  if (SsFmt == ssfWhiteSep) { while (*cur && (*cur != '\n') && ! TCh::IsWs(*cur)) { cur++; } }
141  else { while (*cur && *cur!=SplitCh && (*cur != '\n')) { cur++; } }
142  if (*cur == 0) { break; }
143  if (*cur == '\n') { break; }
144  //*cur = 0;
145  cur++;
146  FieldsV.Add(last); last = cur;
147  if (SkipEmptyFld && strlen(FieldsV.Last())==0) { FieldsV.DelLast(); } // skip empty fields
148  }
149  FieldsV.Add(last); // add last field
150 }
151 
152 int TSsParserMP::GetIntFromFldV(TVec<char*>& FieldsV, const int& FldN) {
153  // parsing format {ws} [+/-] +{ddd}
154  int _Val = -1;
155  bool Minus=false;
156  const char *c = FieldsV[FldN];
157  while (TCh::IsWs(*c)) { c++; }
158  if (*c=='-') { Minus=true; c++; }
159  if (! TCh::IsNum(*c)) { return -1; }
160  _Val = TCh::GetNum(*c); c++;
161  while (TCh::IsNum(*c)){
162  _Val = 10 * _Val + TCh::GetNum(*c);
163  c++;
164  }
165  if (Minus) { _Val = -_Val; }
166  if (*c != 0 && !TCh::IsWs(*c)) { return -1; }
167  return _Val;
168 }
169 
170 double TSsParserMP::GetFltFromFldV(TVec<char*>& FieldsV, const int& FldN) {
171  // parsing format {ws} [+/-] +{d} ([.]{d}) ([E|e] [+/-] +{d})
172  const char *c = FieldsV[FldN];
173  /* skip whitespace at the beginning */
174  while (TCh::IsWs(*c)) { c++; }
175  /* skip the sign */
176  if (*c=='+' || *c=='-') { c++; }
177  /* error, if not a digit or '.' */
178  if (! TCh::IsNum(*c) && *c!='.') { return -1; }
179  /* skip digits */
180  while (TCh::IsNum(*c)) { c++; }
181  if (*c == '.') {
182  c++;
183  while (TCh::IsNum(*c)) { c++; }
184  }
185  /* skip exponent */
186  if (*c=='e' || *c == 'E') {
187  c++;
188  if (*c == '+' || *c == '-' ) { c++; }
189  if (! TCh::IsNum(*c)) { return -1; }
190  while (TCh::IsNum(*c)) { c++; }
191  }
192  if (*c != 0 && !TCh::IsWs(*c)) { return -1; }
193  return atof(FieldsV[FldN]);
194 }
195 
bool Next()
Loads next line from the input file.
Definition: ssmp.cpp:17
bool GetFlt(const int &FldN, double &Val) const
If the field FldN is a float its value is returned in Val and the function returns true...
Definition: ssmp.cpp:69
TVec< uint64 > GetStartPosV(uint64 Lb, uint64 Ub) const
Finds start positions of all lines ending somewhere in [Lb, Ub)
Definition: ssmp.cpp:106
const char * DumpStr() const
Definition: ssmp.cpp:91
static bool IsNum(const char &Ch)
Definition: dt.h:1067
TSsParserMP(const TStr &FNm, const TSsFmt _SsFmt=ssfTabSep, const bool &_SkipLeadBlanks=false, const bool &_SkipCmt=true, const bool &_SkipEmptyFld=false)
Constructor.
bool Empty() const
Definition: dt.h:260
void Clr()
Definition: dt.h:258
TChA LineStr
Current line.
Definition: ssmp.h:15
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
uint64 LineCnt
Number of processed lines so far.
Definition: ssmp.h:13
bool GetInt(const int &FldN, int &Val) const
If the field FldN is an integer its value is returned in Val and the function returns true...
Definition: ssmp.cpp:50
bool SkipLeadBlanks
Ignore leading whitespace characters in a line.
Definition: ssmp.h:10
bool SkipEmptyFld
Skip empty fields (i.e., multiple consecutive separators are considered as one).
Definition: ssmp.h:12
bool GetNextLnBf(TChA &LnChA)
Definition: fl.cpp:763
bool Empty() const
Tests whether the vector is empty.
Definition: ds.h:570
void SkipCommentLines()
Skips lines that begin with a comment character.
Definition: ssmp.cpp:12
void SkipCommentLines()
Move stream pointer along until a non commented line is found.
Definition: fl.cpp:814
static bool IsWs(const char &Ch)
Definition: dt.h:1063
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:1022
char * CStr()
Definition: dt.h:255
TPt< TMIn > FInPt
Pointer to the input file stream.
Definition: ssmp.h:17
unsigned long long uint64
Definition: bd.h:38
TVec< char * > FldV
Pointers to fields of the current line.
Definition: ssmp.h:16
const char * GetFld(const int &FldN) const
Returns the contents of the field at index FldN.
Definition: ssmp.h:66
static int GetNum(const char &Ch)
Definition: dt.h:1069
Whitespace (space or tab) separated.
Definition: ss.h:11
uint64 CountNewLinesInRange(uint64 Lb, uint64 Ub)
Finds number of new line chars in interval [Lb, Ub)
Definition: fl.cpp:782
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
static PSIn New(const void *_Bf, const uint64 &_BfL, const bool &TakeBf=false)
Definition: fl.cpp:668
Space separated.
Definition: ss.h:10
void ToLc()
Transforms the current line to lower case.
Definition: ssmp.cpp:43
uint64 GetLineEndPos(uint64 Ind)
Finds end of line in which Ind is present.
Definition: fl.cpp:802
Definition: dt.h:201
uint64 GetLineStartPos(uint64 Ind)
Finds beginning of line in which Ind is present.
Definition: fl.cpp:795
uint64 CountNewLinesInRange(uint64 Lb, uint64 Ub) const
Counts number of occurrences of '\n' in [Lb, Ub)
Definition: ssmp.cpp:102
int GetIntFromFldV(TVec< char * > &FieldsV, const int &FldN)
Gets integer at field FldN.
Definition: ssmp.cpp:152
void NextFromIndex(uint64 Index, TVec< char * > &FieldsV)
Loads next line starting from a given position.
Definition: ssmp.cpp:128
char * GetLine(uint64 Ind)
Definition: fl.cpp:810
Definition: dt.h:412
static TStr Fmt(const char *FmtStr,...)
Definition: dt.cpp:1599
bool SkipCmt
Skip comments (lines starting with #).
Definition: ssmp.h:11
char SplitCh
Separator character (if one of the non-standard separators is used)
Definition: ssmp.h:14
~TSsParserMP()
Definition: ssmp.cpp:9
TSsFmt SsFmt
Separator type.
Definition: ssmp.h:9
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
void DelLast()
Removes the last element of the vector.
Definition: ds.h:665
uint64 GetStreamLen() const
Returns length of stream.
Definition: ssmp.h:93
Vector is a sequence of TVal objects representing an array that can change in size.
Definition: ds.h:430
double GetFltFromFldV(TVec< char * > &FieldsV, const int &FldN)
Gets float at field FldN.
Definition: ssmp.cpp:170