SNAP Library 3.0, User Reference  2016-07-20 17:56:49
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
Go to the documentation of this file.
2  if (Left != NULL) { Left->GetVariables(Variables); }
3  if (Right != NULL) { Right->GetVariables(Variables); }
4  if (Op == NOP) {
5  if (Atom.Lvar != "" ) { Variables.Add(Atom.Lvar); }
6  if (Atom.Rvar != "" ) { Variables.Add(Atom.Rvar); }
7  }
8 }
10 void TPredicate::GetVariables(TStrV& Variables) {
11  Root->GetVariables(Variables);
12 }
15  TPredicateNode* Curr = Root;
16  TPredicateNode* Prev = NULL;
17  while (!(Curr == NULL && Prev == Root)) {
18  // going down the tree
19  if (Prev == NULL || Prev == Curr->Parent) {
20  // left child exists and was not yet evaluated
21  if (Curr->Left != NULL) {
22  Prev = Curr;
23  Curr = Curr->Left;
24  } else if (Curr->Right != NULL) {
25  Prev = Curr;
26  Curr = Curr->Right;
27  } else {
28  Curr->Result = EvalAtomicPredicate(Curr->Atom);
29  Prev = Curr;
30  Curr = Curr->Parent;
31  }
32  } else if (Prev == Curr->Left) {
33  // going back up through left (first) child
34  switch (Curr->Op) {
35  case NOT: {
36  Assert(Curr->Right == NULL);
37  Curr->Result = !(Prev->Result);
38  Prev = Curr;
39  Curr = Curr->Parent;
40  break;
41  }
42  case AND: {
43  Assert(Curr->Right != NULL);
44  if (!Prev->Result) {
45  Curr->Result = false;
46  Prev = Curr;
47  Curr = Curr->Parent;
48  } else {
49  Prev = Curr;
50  Curr = Curr->Right;
51  }
52  break;
53  }
54  case OR: {
55  Assert(Curr->Right != NULL);
56  if (Prev->Result) {
57  Curr->Result = true;
58  Prev = Curr;
59  Curr = Curr->Parent;
60  } else {
61  Prev = Curr;
62  Curr = Curr->Right;
63  }
64  break;
65  }
66  case NOP: {
67  break;
68  }
69  }
70  } else {
71  // going back up the tree from right (second) child
72  Assert(Prev == Curr->Right);
73  switch (Curr->Op) {
74  case NOT: {
75  Assert(Curr->Left == NULL);
76  Curr->Result = !(Prev->Result);
77  break;
78  }
79  case AND: {
80  Assert(Curr->Left != NULL);
81  Assert(Curr->Left->Result);
82  Curr->Result = Prev->Result;
83  break;
84  }
85  case OR: {
86  Assert(Curr->Left != NULL);
87  Assert(!Curr->Left->Result);
88  Curr->Result = Prev->Result;
89  break;
90  }
91  case NOP: {
92  break;
93  }
94  }
95  Prev = Curr;
96  Curr = Curr->Parent;
97  }
98  }
99  return Root->Result;
100 }
103  switch (Atom.Type) {
104  case atInt: {
105  if (Atom.IsConst) {
106  return EvalAtom<TInt>(IntVars.GetDat(Atom.Lvar), Atom.IntConst, Atom.Compare);
107  }
108  return EvalAtom<TInt>(IntVars.GetDat(Atom.Lvar), IntVars.GetDat(Atom.Rvar), Atom.Compare);
109  }
110  case atFlt: {
111  if (Atom.IsConst) {
112  return EvalAtom<TFlt>(FltVars.GetDat(Atom.Lvar), Atom.FltConst, Atom.Compare);
113  }
114  return EvalAtom<TFlt>(FltVars.GetDat(Atom.Lvar), FltVars.GetDat(Atom.Rvar), Atom.Compare);
115  }
116  case atStr: {
117  if (Atom.IsConst) {
118  return EvalAtom<TStr>(StrVars.GetDat(Atom.Lvar), Atom.StrConst, Atom.Compare);
119  }
120  return EvalAtom<TStr>(StrVars.GetDat(Atom.Lvar), StrVars.GetDat(Atom.Rvar), Atom.Compare);
121  }
122  }
123  return false;
124 }
126 TInt const TTable::Last = -1;
127 TInt const TTable::Invalid = -2;
129 TInt TTable::UseMP = 1;
132  return this->Next();
133 }
137  //Assert(CurrRowIdx != TTable::Invalid);
138  return *this;
139 }
141 bool TRowIterator::operator < (const TRowIterator& RowI) const{
142  if (CurrRowIdx == TTable::Last) { return false; }
143  if (RowI.CurrRowIdx == TTable::Last) { return true; }
144  return CurrRowIdx < RowI.CurrRowIdx;
145 }
147 bool TRowIterator::operator == (const TRowIterator& RowI) const {
148  return CurrRowIdx == RowI.CurrRowIdx;
149 }
152  return CurrRowIdx;
153 }
154 // We do not check column type in the iterator.
156  return Table->IntCols[ColIdx][CurrRowIdx];
157 }
160  return Table->FltCols[ColIdx][CurrRowIdx];
161 }
164  return Table->GetStrVal(ColIdx, CurrRowIdx);
165 }
167 TInt TRowIterator::GetIntAttr(const TStr& Col) const {
168  TInt ColIdx = Table->GetColIdx(Col);
169  return Table->IntCols[ColIdx][CurrRowIdx];
170 }
172 TFlt TRowIterator::GetFltAttr(const TStr& Col) const {
173  TInt ColIdx = Table->GetColIdx(Col);
174  return Table->FltCols[ColIdx][CurrRowIdx];
175 }
177 TStr TRowIterator::GetStrAttr(const TStr& Col) const {
178  return Table->GetStrVal(Col, CurrRowIdx);
179 }
182  TInt ColIdx = Table->GetColIdx(Col);
183  return Table->StrColMaps[ColIdx][CurrRowIdx];
184 }
187  return Table->StrColMaps[ColIdx][CurrRowIdx];
188 }
191  TBool Result;
192  switch (Val.GetType()) {
193  case atInt:
194  Result = TPredicate::EvalAtom(GetIntAttr(ColIdx), Val.GetInt(), Cmp);
195  break;
196  case atFlt:
197  Result = TPredicate::EvalAtom(GetFltAttr(ColIdx), Val.GetFlt(), Cmp);
198  break;
199  case atStr:
200  Result = TPredicate::EvalStrAtom(GetStrAttr(ColIdx), Val.GetStr(), Cmp);
201  break;
202  default:
203  Result = TBool(false);
204  }
205  return Result;
206 }
209  TBool Result;
210  //printf("string compare\n");
211  Result = TPredicate::EvalStrAtom(GetStrAttr(ColIdx), Val, Cmp);
212  return Result;
213 }
216  CurrRowIdx(RowIdx), Table(TablePtr), Start(RowIdx == TablePtr->FirstValidRow) {}
219  return this->Next();
220 }
224  Start = false;
225  Assert(CurrRowIdx != TTable::Invalid);
226  return *this;
227 }
230  if (CurrRowIdx == TTable::Last) { return false; }
231  if (RowI.CurrRowIdx == TTable::Last) { return true; }
232  return CurrRowIdx < RowI.CurrRowIdx;
233 }
236  return CurrRowIdx == RowI.CurrRowIdx;
237 }
240  return CurrRowIdx;
241 }
244  return (Start ? Table->FirstValidRow : Table->Next[CurrRowIdx]);
245 }
247 // We do not check column type in the iterator.
249  return Table->IntCols[ColIdx][GetNextRowIdx()];
250 }
253  return Table->FltCols[ColIdx][GetNextRowIdx()];
254 }
257  return Table->GetStrVal(ColIdx, GetNextRowIdx());
258 }
261  TInt ColIdx = Table->GetColIdx(Col);
262  return Table->IntCols[ColIdx][GetNextRowIdx()];
263 }
266  TInt ColIdx = Table->GetColIdx(Col);
267  return Table->FltCols[ColIdx][GetNextRowIdx()];
268 }
271  return Table->GetStrVal(Col, GetNextRowIdx());
272 }
275  return CurrRowIdx == Table->FirstValidRow;
276 }
280 }
283  TBool Result;
284  switch (Val.GetType()) {
285  case atInt:
286  Result = TPredicate::EvalAtom(GetNextIntAttr(ColIdx), Val.GetInt(), Cmp);
287  break;
288  case atFlt:
289  Result = TPredicate::EvalAtom(GetNextFltAttr(ColIdx), Val.GetFlt(), Cmp);
290  break;
291  case atStr:
292  Result = TPredicate::EvalStrAtom(GetNextStrAttr(ColIdx), Val.GetStr(), Cmp);
293  break;
294  default:
295  Result = TBool(false);
296  }
297  return Result;
298 }
300 // Better not use default constructor as it leads to a memory leak.
301 // - OR - implement a destructor.
302 TTable::TTable(): Context(new TTableContext), NumRows(0), NumValidRows(0),
303  FirstValidRow(0), LastValidRow(-1) {}
305 TTable::TTable(TTableContext* Context): Context(Context), NumRows(0),
306  NumValidRows(0), FirstValidRow(0), LastValidRow(-1) {}
308 TTable::TTable(const Schema& TableSchema, TTableContext* Context): Context(Context),
309  NumRows(0), NumValidRows(0), FirstValidRow(0), LastValidRow(-1), IsNextDirty(0) {
310  TInt IntColCnt = 0;
311  TInt FltColCnt = 0;
312  TInt StrColCnt = 0;
313  for (TInt i = 0; i < TableSchema.Len(); i++) {
314  TStr ColName = TableSchema[i].Val1;
315  TAttrType ColType = TableSchema[i].Val2;
316  AddSchemaCol(ColName, ColType);
317  switch (ColType) {
318  case atInt:
319  AddColType(ColName, atInt, IntColCnt);
320  IntColCnt++;
321  break;
322  case atFlt:
323  AddColType(ColName, atFlt, FltColCnt);
324  FltColCnt++;
325  break;
326  case atStr:
327  AddColType(ColName, atStr, StrColCnt);
328  StrColCnt++;
329  break;
330  }
331  }
332  IntCols = TVec<TIntV>(IntColCnt);
333  FltCols = TVec<TFltV>(FltColCnt);
334  StrColMaps = TVec<TIntV>(StrColCnt);
335 }
337 TTable::TTable(TSIn& SIn, TTableContext* Context): Context(Context), NumRows(SIn),
338  NumValidRows(SIn), FirstValidRow(SIn), LastValidRow(SIn), Next(SIn), IntCols(SIn),
339  FltCols(SIn), StrColMaps(SIn) {
340  THash<TStr,TPair<TInt,TInt> > ColTypeIntMap(SIn);
342  ColTypeMap.Clr();
343  Sch.Clr();
344  for (THash<TStr,TPair<TInt,TInt> >::TIter it = ColTypeIntMap.BegI(); it < ColTypeIntMap.EndI(); it++) {
345  TPair<TInt,TInt> dat = it.GetDat();
346  switch (dat.GetVal1()) {
347  case 0:
348  AddColType(it.GetKey(), atInt, dat.GetVal2());
349  AddSchemaCol(it.GetKey(), atInt);
350  break;
351  case 1:
352  AddColType(it.GetKey(), atFlt, dat.GetVal2());
353  AddSchemaCol(it.GetKey(), atFlt);
354  break;
355  case 2:
356  AddColType(it.GetKey(), atStr, dat.GetVal2());
357  AddSchemaCol(it.GetKey(), atStr);
358  break;
359  }
360  }
362  IsNextDirty = 0;
363 }
365 TTable::TTable(const TIntIntH& H, const TStr& Col1, const TStr& Col2,
366  TTableContext* Context, const TBool IsStrKeys) : Context(Context), NumRows(H.Len()),
367  NumValidRows(H.Len()), FirstValidRow(0), LastValidRow(H.Len()-1) {
368  TAttrType KeyType = IsStrKeys ? atStr : atInt;
369  AddSchemaCol(Col1, KeyType);
370  AddSchemaCol(Col2, atInt);
371  AddColType(Col1, KeyType, 0);
372  AddColType(Col2, atInt, 1);
373  if (IsStrKeys) {
374  StrColMaps = TVec<TIntV>(1);
375  IntCols = TVec<TIntV>(1);
376  H.GetKeyV(StrColMaps[0]);
377  H.GetDatV(IntCols[0]);
378  } else {
379  IntCols = TVec<TIntV>(2);
380  H.GetKeyV(IntCols[0]);
381  H.GetDatV(IntCols[1]);
382  }
383  Next = TIntV(NumRows);
384  for (TInt i = 0; i < NumRows; i++) {
385  Next[i] = i+1;
386  }
387  Next[NumRows-1] = Last;
388  IsNextDirty = 0;
389  InitIds();
390 }
392 TTable::TTable(const TIntFltH& H, const TStr& Col1, const TStr& Col2,
393  TTableContext* Context, const TBool IsStrKeys) : Context(Context),
394  NumRows(H.Len()), NumValidRows(H.Len()), FirstValidRow(0), LastValidRow(H.Len()-1) {
395  TAttrType KeyType = IsStrKeys ? atStr : atInt;
396  AddSchemaCol(Col1, KeyType);
397  AddSchemaCol(Col2, atFlt);
398  AddColType(Col1, KeyType, 0);
399  AddColType(Col2, atFlt, 0);
400  if (IsStrKeys) {
401  StrColMaps = TVec<TIntV>(1);
402  H.GetKeyV(StrColMaps[0]);
403  } else {
404  IntCols = TVec<TIntV>(1);
405  H.GetKeyV(IntCols[0]);
406  }
407  FltCols = TVec<TFltV>(1);
408  H.GetDatV(FltCols[0]);
409  Next = TIntV(NumRows);
410  for (TInt i = 0; i < NumRows; i++) {
411  Next[i] = i+1;
412  }
413  Next[NumRows-1] = Last;
414  IsNextDirty = 0;
415  InitIds();
416 }
418 TTable::TTable(const TTable& Table, const TIntV& RowIDs) : Context(Table.Context),
419  Sch(Table.Sch), SrcCol(Table.SrcCol), DstCol(Table.DstCol), EdgeAttrV(Table.EdgeAttrV),
420  SrcNodeAttrV(Table.SrcNodeAttrV), DstNodeAttrV(Table.DstNodeAttrV),
421  CommonNodeAttrs(Table.CommonNodeAttrs) {
422  ColTypeMap = Table.ColTypeMap;
423  IntCols = TVec<TIntV>(Table.IntCols.Len());
424  FltCols = TVec<TFltV>(Table.FltCols.Len());
426  FirstValidRow = 0;
427  LastValidRow = -1;
428  NumRows = 0;
429  NumValidRows = 0;
430  AddSelectedRows(Table, RowIDs);
431  IsNextDirty = 0;
432  InitIds();
433 }
435 void TTable::GetSchema(const TStr& InFNm, Schema& S, const char& Separator) {
436  // Determine Attr Type
437  // Assume that the data is tab separated
438  TSsParser Ss(InFNm, '\t', false, false, false);
439  TInt rowsToPeek = 1000;
440  TInt currRow = 0;
441  TInt lastComment = 0;
442  while (Ss.Next()) {
443  if (Ss.IsCmt()) {
444  lastComment += 1;
445  }
446  else break;
447  }
448  if (Ss.Eof()) {TExcept::Throw("No Data to determine attribute types!");}
449  TInt numCols = Ss.GetFlds();
450  TVec<TAttrType> colAttrV(numCols);
451  colAttrV.PutAll(atInt);
452  while (true) {
453  for (TInt i = 0; i < numCols; i++) {
454  if (Ss.IsInt(i)) {
455  }
456  else if (Ss.IsFlt(i)) {
457  colAttrV[i] = atFlt;
458  }
459  else {
460  colAttrV[i] = atStr;
461  }
462  }
463  currRow++;
464  if (currRow > rowsToPeek || Ss.Eof()) break;
465  Ss.Next();
466  }
467  // Default Separator is tab
468  TSsParser SsNames(InFNm, Separator, false, false, false);
469  for (int i = 0; i < lastComment; i++) { SsNames.Next();}
470  TVec<TStr> attrV;
471  TStr first(SsNames[0]);
472  int begin = 0;
473  TStr comment('#');
474  if (first != comment) {
475  for (int i = 1; i < first.Len(); i++){
476  if (first[i] != ' ') { begin = i; break;}
477  }
478  attrV.Add(first.GetSubStr(begin));
479  }
480  for (int i = 1; i < SsNames.GetFlds(); i++) {attrV.Add(SsNames[i]);}
481  for (TInt i = 0; i < numCols; i++) {
482  S.Add(TPair<TStr,TAttrType>(attrV[i],colAttrV[i]));
483  }
484 }
#ifdef GCC_ATOMIC
/// Loads a delimited text file into T using OpenMP threads.
///
/// The stream after the comment lines (and optional title line) is cut into
/// one byte range per thread. Lines are counted per range to size the columns
/// and to give every thread its starting row index; then each thread parses
/// its own lines straight into the pre-sized int/float columns. Finally the
/// "Next" chain and an "_id" int column are filled in.
///
/// Only int and float columns are supported; a string column throws.
/// When RelevantCols is non-empty, schema column j is read from input field
/// RelevantCols[j]; otherwise from field j.
/// Throws on a title line that does not match S or on a row whose field count
/// differs from S.Len().
/// NOTE(review): the per-row throw happens inside an OpenMP parallel loop —
/// confirm that this is handled as intended.
void TTable::LoadSSPar(PTable& T, const Schema& S, const TStr& InFNm, const TIntV& RelevantCols,
                        const char& Separator, TBool HasTitleLine) {
  // preloaded necessary variables
  TInt RowLen = T->Sch.Len();
  TVec<TAttrType> ColTypes = TVec<TAttrType>(RowLen);
  for (TInt i = 0; i < RowLen; i++) {
    ColTypes[i] = T->GetSchemaColType(i);
  }

  TSsParserMP Ss(InFNm, Separator);
  Ss.SkipCommentLines();

  // if title line (i.e. names of the columns) is included as first row in the
  // input file - use it to validate schema
  if (HasTitleLine) {
    Ss.Next();
    if (S.Len() != Ss.GetFlds()) {
      printf("%s\n", Ss[0]); TExcept::Throw("Table Schema Mismatch!");
    }
    for (TInt i = 0; i < Ss.GetFlds(); i++) {
      // remove carriage return char; an empty field has no last character
      TInt L = strlen(Ss[i]);
      if (L > 0 && Ss[i][L-1] < ' ') { Ss[i][L-1] = 0; }
      if (NormalizeColName(S[i].Val1) != NormalizeColName(Ss[i])) { TExcept::Throw("Table Schema Mismatch!"); }
    }
  }

  // Divide remaining part of stream into equal sized chunks
  // Find starting position in stream for each thread
  int64 Cnt = 0;
  uint64 Pos = Ss.GetStreamPos();
  uint64 Len = Ss.GetStreamLen();
  uint64 Rem = Len - Pos;
  int NumThreads = omp_get_max_threads();

  uint64 Delta = Rem / NumThreads;
  if (Delta < 1) Delta = 1;

  TVec<uint64> StartIntV(NumThreads);
  TVec<uint64> LineCountV(NumThreads);
  TVec<uint64> PrefixSumV(NumThreads);

  StartIntV[0] = Pos;
  for (int i = 1; i < NumThreads; i++) {
    // Clamp to the stream length: when fewer bytes remain than there are
    // threads, Delta is forced to 1 and unclamped starts would pass the end.
    uint64 ChunkStart = StartIntV[i-1] + Delta;
    StartIntV[i] = (ChunkStart < Len) ? ChunkStart : Len;
  }
  StartIntV.Add(Len);

  // Find number of lines handled by each thread
  omp_set_num_threads(NumThreads);
  #pragma omp parallel for schedule(dynamic) reduction(+:Cnt)
  for (int i = 0; i < NumThreads; i++) {
    LineCountV[i] = Ss.CountNewLinesInRange(StartIntV[i], StartIntV[i+1]);
    Cnt += LineCountV[i];
  }

  // Calculate row index offsets for each thread
  PrefixSumV[0] = 0;
  for (int i = 1; i < NumThreads; i++) {
    PrefixSumV[i] = PrefixSumV[i-1] + LineCountV[i-1];
  }
  Ss.SetStreamPos(Pos);

  // allocate memory for columns
  TInt IntColIdx = 0;
  TInt FltColIdx = 0;
  for (TInt i = 0; i < RowLen; i++) {
    switch (ColTypes[i]) {
      case atInt:
        T->IntCols[IntColIdx].Gen(Cnt);
        IntColIdx++;
        break;
      case atFlt:
        T->FltCols[FltColIdx].Gen(Cnt);
        FltColIdx++;
        break;
      case atStr:
        break;
    }
  }

  // Second pass: Cnt is recounted as the number of rows actually parsed.
  Cnt = 0;
  omp_set_num_threads(NumThreads);
  #pragma omp parallel for schedule(dynamic) reduction(+:Cnt)
  for (int i = 0; i < NumThreads; i++) {
    // calculate beginning of each line handled by thread
    TVec<uint64> LineStartPosV = Ss.GetStartPosV(StartIntV[i], StartIntV[i+1]);

    // parse line and fill rows
    for (uint64 k = 0; k < (uint64) LineStartPosV.Len(); k++) {
      TVec<char*> FieldsV;
      Ss.NextFromIndex(LineStartPosV[k], FieldsV);
      if (FieldsV.Len() != S.Len()) {
        TExcept::Throw("Error reading tsv file");
      }
      TInt IntColIdx = 0;
      TInt FltColIdx = 0;
      // global row index = rows owned by earlier chunks + offset in this chunk
      TInt RowIdx = PrefixSumV[i] + k;

      for (TInt j = 0; j < RowLen; j++) {
        switch (ColTypes[j]) {
          case atInt:
            if (RelevantCols.Len() == 0) {
              T->IntCols[IntColIdx][RowIdx] = (Ss.GetIntFromFldV(FieldsV, j));
            } else {
              T->IntCols[IntColIdx][RowIdx] = (Ss.GetIntFromFldV(FieldsV, RelevantCols[j]));
            }
            IntColIdx++;
            break;
          case atFlt:
            if (RelevantCols.Len() == 0) {
              T->FltCols[FltColIdx][RowIdx] = (Ss.GetFltFromFldV(FieldsV, j));
            } else {
              T->FltCols[FltColIdx][RowIdx] = (Ss.GetFltFromFldV(FieldsV, RelevantCols[j]));
            }
            FltColIdx++;
            break;
          case atStr:
            TExcept::Throw("TTable::LoadSS:: Str Col found\n");
            break;
        }
      }
      Cnt++;
    }
  }

  // set number of rows and "Next" vector
  T->NumRows = Cnt;
  T->NumValidRows = T->NumRows;

  T->Next.Clr();
  T->Next.Gen(Cnt);

  omp_set_num_threads(NumThreads);
  #pragma omp parallel for schedule(dynamic, 10000)
  for (int64 i = 0; i < Cnt-1; i++) {
    T->Next[i] = i+1;
  }
  T->IsNextDirty = 0;
  // With no data rows Next is empty and there is no tail entry to terminate.
  if (Cnt > 0) {
    T->Next[Cnt-1] = Last;
  }
  T->LastValidRow = T->NumRows - 1;

  T->IdColName = "_id";
  TInt IdCol = T->IntCols.Add();
  T->IntCols[IdCol].Gen(Cnt);

  // initialize ID column
  omp_set_num_threads(NumThreads);
  #pragma omp parallel for schedule(dynamic, 10000)
  for (int64 i = 0; i < Cnt; i++) {
    T->IntCols[IdCol][i] = i;
  }

  T->AddSchemaCol(T->IdColName, atInt);
  T->AddColType(T->IdColName, atInt, T->IntCols.Len()-1);
}
#endif // GCC_ATOMIC
650  PTable& T, const Schema& S, const TStr& InFNm, const TIntV& RelevantCols,
651  const char& Separator, TBool HasTitleLine) {
652  // preloaded necessary variables
653  int RowLen = T->Sch.Len();
654  TVec<TAttrType> ColTypes = TVec<TAttrType>(RowLen);
655  for (int i = 0; i < RowLen; i++) {
656  ColTypes[i] = T->GetSchemaColType(i);
657  }
659  // Sequential load
660  TSsParser Ss(InFNm, Separator);
661  // if title line (i.e. names of the columns) is included as first row in the
662  // input file - use it to validate schema
663  if (HasTitleLine) {
664  Ss.Next();
665  if (S.Len() != Ss.GetFlds()) {
666  printf("%s\n", Ss[0]); TExcept::Throw("Table Schema Mismatch!");
667  }
668  for (int i = 0; i < Ss.GetFlds(); i++) {
669  // remove carriage return char
670  int L = strlen(Ss[i]);
671  if (Ss[i][L-1] < ' ') { Ss[i][L-1] = 0; }
672  if (NormalizeColName(S[i].Val1) != NormalizeColName(Ss[i])) { TExcept::Throw("Table Schema Mismatch!"); }
673  }
674  }
676  // populate table columns
677  //printf("starting to populate table\n");
678  uint64 Cnt = 0;
679  while (Ss.Next()) {
680  int IntColIdx = 0;
681  int FltColIdx = 0;
682  int StrColIdx = 0;
683  Assert(Ss.GetFlds() == S.Len()); // compiled only in debug
684  if (Ss.GetFlds() != S.Len()) {
685  printf("%s\n", Ss[S.Len()]); TExcept::Throw("Error reading tsv file");
686  }
687  for (int i = 0; i < RowLen; i++) {
688  switch (ColTypes[i]) {
689  case atInt:
690  if (RelevantCols.Len() == 0) {
691  T->IntCols[IntColIdx].Add(Ss.GetInt(i));
692  } else {
693  T->IntCols[IntColIdx].Add(Ss.GetInt(RelevantCols[i]));
694  }
695  IntColIdx++;
696  break;
697  case atFlt:
698  if (RelevantCols.Len() == 0) {
699  T->FltCols[FltColIdx].Add(Ss.GetFlt(i));
700  } else {
701  T->FltCols[FltColIdx].Add(Ss.GetFlt(RelevantCols[i]));
702  }
703  FltColIdx++;
704  break;
705  case atStr:
706  int ColIdx;
707  if (RelevantCols.Len() == 0) {
708  ColIdx = i;
709  } else {
710  ColIdx = RelevantCols[i];
711  }
712  TStr Sval = TStr(Ss[ColIdx]);
713  T->AddStrVal(StrColIdx, Sval);
714  StrColIdx++;
715  break;
716  }
717  }
718  Cnt += 1;
719  }
720  //printf("finished populating table\n");
721  // set number of rows and "Next" vector
722  T->NumRows = static_cast<int>(Cnt);
723  T->NumValidRows = T->NumRows;
725  T->Next.Clr();
726  T->Next.Gen(static_cast<int>(Cnt));
727  for (uint64 i = 0; i < Cnt-1; i++) {
728  T->Next[static_cast<int>(i)] = static_cast<int>(i+1);
729  }
730  T->IsNextDirty = 0;
731  T->Next[static_cast<int>(Cnt-1)] = Last;
732  T->LastValidRow = T->NumRows - 1;
734  T->InitIds();
735 }
737 PTable TTable::LoadSS(const Schema& S, const TStr& InFNm, TTableContext* Context,
738  const TIntV& RelevantCols, const char& Separator, TBool HasTitleLine) {
739  TVec<uint64> IntGroupByCols;
740  bool NoStringCols = true;
742  // find the schema for the new table which contains only relevant columns
743  Schema SR;
744  if (RelevantCols.Len() == 0) {
745  SR = S;
746  } else {
747  for (int i = 0; i < RelevantCols.Len(); i++) {
748  SR.Add(S[RelevantCols[i]]);
749  }
750  }
751  PTable T = New(SR, Context);
753  // find col types and check for string cols
754  for (int i = 0; i < SR.Len(); i++) {
755  if (T->GetSchemaColType(i) == atStr) {
756  NoStringCols = false;
757  break;
758  }
759  }
761  if (GetMP() && NoStringCols) {
762  // Right now, can load in parallel only in Linux (for mmap) and if
763  // there are no string columns
764 #ifdef GLib_LINUX
765  LoadSSPar(T, S, InFNm, RelevantCols, Separator, HasTitleLine);
766 #else
767  LoadSSSeq(T, S, InFNm, RelevantCols, Separator, HasTitleLine);
768 #endif
769  } else {
770  LoadSSSeq(T, S, InFNm, RelevantCols, Separator, HasTitleLine);
771  }
772  return T;
773 }
775 PTable TTable::LoadSS(const Schema& S, const TStr& InFNm, TTableContext* Context,
776  const char& Separator, TBool HasTitleLine) {
777  return LoadSS(S, InFNm, Context, TIntV(), Separator, HasTitleLine);
778 }
780 void TTable::SaveSS(const TStr& OutFNm) {
781  if (NumValidRows == 0) {
782  printf("Table is empty");
783  return;
784  }
785  FILE* F = fopen(OutFNm.CStr(), "w");
786  // debug
787  if (F == NULL) {
788  printf("failed to open file %s\n", OutFNm.CStr());
789  perror("fail ");
790  return;
791  }
793  Dump(F);
795 #if 0
796  Schema DSch = DenormalizeSchema();
798  TInt L = Sch.Len();
799  // print title (schema)
800  fprintf(F, "# ");
801  for (TInt i = 0; i < L-1; i++) {
802  fprintf(F, "%s\t", DSch[i].Val1.CStr());
803  }
804  fprintf(F, "%s\n", DSch[L-1].Val1.CStr());
805  // print table contents
806  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
807  for (TInt i = 0; i < L; i++) {
808  char C = (i == L-1) ? '\n' : '\t';
809  switch (GetSchemaColType(i)) {
810  case atInt: {
811  fprintf(F, "%d%c", RowI.GetIntAttr(GetSchemaColName(i)).Val, C);
812  break;
813  }
814  case atFlt: {
815  fprintf(F, "%f%c", RowI.GetFltAttr(GetSchemaColName(i)).Val, C);
816  break;
817  }
818  case atStr: {
819  fprintf(F, "%s%c", RowI.GetStrAttr(GetSchemaColName(i)).CStr(), C);
820  break;
821  }
822  }
823  }
824  }
825 #endif
826  fclose(F);
827 }
829 void TTable::SaveBin(const TStr& OutFNm) {
830  TFOut SOut(OutFNm);
831  Save(SOut);
832 }
834 void TTable::Save(TSOut& SOut) {
835  NumRows.Save(SOut);
836  NumValidRows.Save(SOut);
837  FirstValidRow.Save(SOut);
838  LastValidRow.Save(SOut);
839  Next.Save(SOut);
840  IntCols.Save(SOut);
841  FltCols.Save(SOut);
842  StrColMaps.Save(SOut);
844  THash<TStr,TPair<TInt,TInt> > ColTypeIntMap;
845  TInt atIntVal = TInt(0);
846  TInt atFltVal = TInt(1);
847  TInt atStrVal = TInt(2);
848  for (THash<TStr,TPair<TAttrType,TInt> >::TIter it = ColTypeMap.BegI(); it < ColTypeMap.EndI(); it++) {
849  TPair<TAttrType,TInt> dat = it.GetDat();
850  TStr DColName = DenormalizeColName(it.GetKey());
851  switch (dat.GetVal1()) {
852  case atInt:
853  ColTypeIntMap.AddDat(DColName, TPair<TInt,TInt>(atIntVal, dat.GetVal2()));
854  break;
855  case atFlt:
856  ColTypeIntMap.AddDat(DColName, TPair<TInt,TInt>(atFltVal, dat.GetVal2()));
857  break;
858  case atStr:
859  ColTypeIntMap.AddDat(DColName, TPair<TInt,TInt>(atStrVal, dat.GetVal2()));
860  break;
861  }
862  }
863  ColTypeIntMap.Save(SOut);
864  SOut.Flush();
865 }
867 void TTable::Dump(FILE *OutF) const {
868  TInt L = Sch.Len();
869  Schema DSch = DenormalizeSchema();
871  // LoadSS() will not throw away lines with #
872  //fprintf(OutF, "# Table: rows: %d, columns: %d\n", GetNumValidRows(), GetNodes());
873  // print title (schema), LoadSS() will take first line as (optional) schema
874  fprintf(OutF, "# ");
875  for (TInt i = 0; i < L-1; i++) {
876  fprintf(OutF, "%s\t", DSch[i].Val1.CStr());
877  }
878  fprintf(OutF, "%s\n", DSch[L-1].Val1.CStr());
879  // print table contents
880  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
881  for (TInt i = 0; i < L; i++) {
882  char C = (i == L-1) ? '\n' : '\t';
883  switch (GetSchemaColType(i)) {
884  case atInt: {
885  fprintf(OutF, "%d%c", RowI.GetIntAttr(GetSchemaColName(i)).Val, C);
886  break;
887  }
888  case atFlt: {
889  fprintf(OutF, "%f%c", RowI.GetFltAttr(GetSchemaColName(i)).Val, C);
890  break;
891  }
892  case atStr: {
893  fprintf(OutF, "%s%c", RowI.GetStrAttr(GetSchemaColName(i)).CStr(), C);
894  break;
895  }
896  }
897  }
898  }
899 }
902  TInt L = Sch.Len();
904 #if 0
905  // print table on the input, iterate over all columns
906  for (TInt i = 0; i < L; i++) {
907  // skip non-string columns
908  if (GetSchemaColType(i) != atStr) {
909  continue;
910  }
912  TInt ColIdx = GetColIdx(GetSchemaColName(i));
914  // iterate over all rows
915  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
916  TInt RowIdx = RowI.GetRowIdx();
917  TInt KeyId = StrColMaps[ColIdx][RowIdx];
918  printf("ChangeContext in %d %d %d .%s.\n",
919  ColIdx.Val, RowIdx.Val, KeyId.Val, GetStrVal(ColIdx, RowIdx).CStr());
920  }
921  }
922 #endif
924  // add strings to the new context, change values
925  // iterate over all columns
926  for (TInt i = 0; i < L; i++) {
927  // skip non-string columns
928  if (GetSchemaColType(i) != atStr) {
929  continue;
930  }
932  TInt ColIdx = GetColIdx(GetSchemaColName(i));
934  // iterate over all rows
935  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
936  TInt RowIdx = RowI.GetRowIdx();
937  // get the string
938  TStr Key = GetStrVal(ColIdx, RowIdx);
939  // add the string to the new context
940  TInt KeyId = TInt(NewContext->StringVals.AddKey(Key));
941  // change the value in the table
942  StrColMaps[ColIdx][RowIdx] = KeyId;
943  }
944  }
946  // set the new context
947  Context = NewContext;
948  return Context;
949 }
951 void TTable::AddStrVal(const TInt& ColIdx, const TStr& Key) {
952  TInt KeyId = TInt(Context->StringVals.AddKey(Key));
953  //printf("TTable::AddStrVal2 %d .%s. %d\n", ColIdx.Val, Key.CStr(), KeyId.Val);
954  StrColMaps[ColIdx].Add(KeyId);
955 }
957 void TTable::AddStrVal(const TStr& Col, const TStr& Key) {
958  if (GetColType(Col) != atStr) {
959  TExcept::Throw(Col + " is not a string valued column");
960  }
961  //printf("TTable::AddStrVal1 .%s. .%s.\n", Col.CStr(), Key.CStr());
962  AddStrVal(GetColIdx(Col), Key);
963 }
965 void TTable::AddGraphAttribute(const TStr& Attr, TBool IsEdge, TBool IsSrc, TBool IsDst) {
966  if (!IsColName(Attr)) { TExcept::Throw(Attr + ": No such column"); }
967  if (IsEdge) { EdgeAttrV.Add(NormalizeColName(Attr)); }
968  if (IsSrc) { SrcNodeAttrV.Add(NormalizeColName(Attr)); }
969  if (IsDst) { DstNodeAttrV.Add(NormalizeColName(Attr)); }
970 }
972 void TTable::AddGraphAttributeV(TStrV& Attrs, TBool IsEdge, TBool IsSrc, TBool IsDst) {
973  for (TInt i = 0; i < Attrs.Len(); i++) {
974  if (!IsColName(Attrs[i])) {
975  TExcept::Throw(Attrs[i] + ": no such column");
976  }
977  }
978  for (TInt i = 0; i < Attrs.Len(); i++) {
979  if (IsEdge) { EdgeAttrV.Add(NormalizeColName(Attrs[i])); }
980  if (IsSrc) { SrcNodeAttrV.Add(NormalizeColName(Attrs[i])); }
981  if (IsDst) { DstNodeAttrV.Add(NormalizeColName(Attrs[i])); }
982  }
983 }
986  TStrV IntNA = TStrV(IntCols.Len(),0);
987  for (TInt i = 0; i < SrcNodeAttrV.Len(); i++) {
988  TStr Attr = SrcNodeAttrV[i];
989  if (GetColType(Attr) == atInt) {
990  IntNA.Add(Attr);
991  }
992  }
993  return IntNA;
994 }
997  TStrV IntNA = TStrV(IntCols.Len(),0);
998  for (TInt i = 0; i < DstNodeAttrV.Len(); i++) {
999  TStr Attr = DstNodeAttrV[i];
1000  if (GetColType(Attr) == atInt) {
1001  IntNA.Add(Attr);
1002  }
1003  }
1004  return IntNA;
1005 }
1008  TStrV IntEA = TStrV(IntCols.Len(),0);
1009  for (TInt i = 0; i < EdgeAttrV.Len(); i++) {
1010  TStr Attr = EdgeAttrV[i];
1011  if (GetColType(Attr) == atInt) {
1012  IntEA.Add(Attr);
1013  }
1014  }
1015  return IntEA;
1016 }
1019  TStrV FltNA = TStrV(FltCols.Len(),0);
1020  for (TInt i = 0; i < SrcNodeAttrV.Len(); i++) {
1021  TStr Attr = SrcNodeAttrV[i];
1022  if (GetColType(Attr) == atFlt) {
1023  FltNA.Add(Attr);
1024  }
1025  }
1026  return FltNA;
1027 }
1030  TStrV FltNA = TStrV(FltCols.Len(),0);
1031  for (TInt i = 0; i < DstNodeAttrV.Len(); i++) {
1032  TStr Attr = DstNodeAttrV[i];
1033  if (GetColType(Attr) == atFlt) {
1034  FltNA.Add(Attr);
1035  }
1036  }
1037  return FltNA;
1038 }
1041  TStrV FltEA = TStrV(FltCols.Len(),0);;
1042  for (TInt i = 0; i < EdgeAttrV.Len(); i++) {
1043  TStr Attr = EdgeAttrV[i];
1044  if (GetColType(Attr) == atFlt) {
1045  FltEA.Add(Attr);
1046  }
1047  }
1048  return FltEA;
1049 }
1052  TStrV StrNA = TStrV(StrColMaps.Len(),0);
1053  for (TInt i = 0; i < SrcNodeAttrV.Len(); i++) {
1054  TStr Attr = SrcNodeAttrV[i];
1055  if (GetColType(Attr) == atStr) {
1056  StrNA.Add(Attr);
1057  }
1058  }
1059  return StrNA;
1060 }
1063  TStrV StrNA = TStrV(StrColMaps.Len(),0);
1064  for (TInt i = 0; i < DstNodeAttrV.Len(); i++) {
1065  TStr Attr = DstNodeAttrV[i];
1066  if (GetColType(Attr) == atStr) {
1067  StrNA.Add(Attr);
1068  }
1069  }
1070  return StrNA;
1071 }
1075  TStrV StrEA = TStrV(StrColMaps.Len(),0);
1076  for (TInt i = 0; i < EdgeAttrV.Len(); i++) {
1077  TStr Attr = EdgeAttrV[i];
1078  if (GetColType(Attr) == atStr) {
1079  StrEA.Add(Attr);
1080  }
1081  }
1082  return StrEA;
1083 }
1085 void TTable::Rename(const TStr& column, const TStr& NewLabel) {
1086  // This function is necessary, for example to take the union of two tables
1087  // where the attribute names don't match.
1088  if (!IsColName(column)) { TExcept::Throw("no such column " + column); }
1089  TPair<TAttrType,TInt> ColVal = GetColTypeMap(column);
1090  DelColType(column);
1091  AddColType(NewLabel, ColVal);
1092  TStr NColName = NormalizeColName(column);
1093  TStr NLabel = NormalizeColName(NewLabel);
1094  for (TInt c = 0; c < Sch.Len(); c++) {
1095  if (Sch[c].Val1 == NColName) {
1096  Sch.SetVal(c, TPair<TStr, TAttrType>(NLabel, Sch[c].Val2));
1097  break;
1098  }
1099  }
1100 }
1103  if (FirstValidRow == LastValidRow) {
1104  LastValidRow = -1;
1105  }
1107  TInt Old = FirstValidRow;
1109  Next[Old] = TTable::Invalid;
1110  NumValidRows--;
1111  TInt IdColIdx = GetColIdx(GetIdColName());
1112  RowIdMap.AddDat(IntCols[IdColIdx][Old], Invalid);
1113 }
1115 void TTable::RemoveRow(TInt RowIdx, TInt PrevRowIdx) {
1116  if (RowIdx == FirstValidRow) {
1117  RemoveFirstRow();
1118  return;
1119  }
1120  Assert(RowIdx != TTable::Invalid);
1121  if (RowIdx == TTable::Last) { return; }
1122  Next[PrevRowIdx] = Next[RowIdx];
1123  if (LastValidRow == RowIdx) {
1124  LastValidRow = RowIdx;
1125  }
1126  Next[RowIdx] = TTable::Invalid;
1127  NumValidRows--;
1128  TInt IdColIdx = GetColIdx(GetIdColName());
1129  RowIdMap.AddDat(IntCols[IdColIdx][RowIdx], Invalid);
1130 }
1132 void TTable::KeepSortedRows(const TIntV& KeepV) {
1133  TIntIntH KeepH(KeepV.Len());
1134  for (TInt i = 0; i < KeepV.Len(); i++) {
1135  KeepH.AddKey(KeepV[i]);
1136  }
1139  TInt KeepSize = 0;
1140  while (RowI.GetNextRowIdx() != Last) {
1141  if (KeepSize < KeepV.Len()) {
1142  if (KeepH.IsKey(RowI.GetNextRowIdx())) {
1143  KeepSize++;
1144  RowI++;
1145  } else {
1146  RowI.RemoveNext();
1147  }
1148  } else {
1149  // Covered all of KeepV. Remove the rest of the rows.
1150  // Current RowI.CurrRowIdx is the last element of KeepV.
1151  RowI.RemoveNext();
1152  }
1153  }
1154  LastValidRow = KeepV[KeepV.Len()-1];
1155 }
// Splits the valid rows into consecutive ranges and appends them to Partitions
// as (start row index, end row index) pairs.
// The partition size is NumValidRows / NumPartitions rounded up, but never less
// than 10 rows; when the minimum applies, NumPartitions is recomputed (it is
// only used to size the Reserve call).
// Each pair's second value is the start of the following range; the last pair
// ends at the chain terminator (TTable::Last on the sequential path).
// NOTE(review): the end index looks exclusive (GroupByIntColMP iterates while
// RowI < EndI) - confirm against other callers.
1157 void TTable::GetPartitionRanges(TIntPrV& Partitions, TInt NumPartitions) const {
1158  TInt PartitionSize = NumValidRows / (NumPartitions);
1159  if (NumValidRows % NumPartitions != 0) PartitionSize++;
1160  if (PartitionSize < 10) {
1161  PartitionSize = 10;
1162  NumPartitions = NumValidRows / PartitionSize;
1163  }
1164  Partitions.Reserve(NumPartitions+1);
1166  TInt currRow = FirstValidRow;
1167  TInt currStart = currRow;
1168  if (IsNextDirty) {
  // Rows are not stored in sequence: follow the Next chain and cut a new range
  // every PartitionSize rows.
1169  TInt currCount = PartitionSize;
1170  while (currRow != TTable::Last) {
1171  if (currCount == 0) {
1172  Partitions.Add(TIntPr(currStart, currRow));
1173  currStart = currRow;
1174  currCount = PartitionSize;
1175  }
1176  currRow = Next[currRow];
1177  currCount--;
1178  }
  // Final (possibly shorter) range; currRow equals TTable::Last here.
1179  Partitions.Add(TIntPr(currStart, currRow));
1180  } else {
1181  // Optimize for the case when rows are logically in sequence.
1182  currRow += PartitionSize;
1183  while (currRow != TTable::Last && currRow < Next.Len()) {
  // A range boundary must fall on a valid row: skip forward over removed rows.
1184  if (Next[currRow] == TTable::Invalid) { currRow++; continue; }
1185  Partitions.Add(TIntPr(currStart, currRow));
1186  currStart = currRow;
1187  currRow += PartitionSize;
1188  }
1189  Partitions.Add(TIntPr(currStart, TTable::Last));
1190  }
1191  //printf("Num partitions: %d\n", Partitions.Len());
1192 }
1194 /***** Grouping Utility functions ****/
1195 void TTable::GroupingSanityCheck(const TStr& GroupBy, const TAttrType& AttrType) const {
1196  if (!IsColName(GroupBy)) {
1197  TExcept::Throw("no such column " + GroupBy);
1198  }
1199  if (GetColType(GroupBy) != AttrType) {
1200  TExcept::Throw(GroupBy + " values are not of expected type");
1201  }
1202 }
1204 #ifdef GCC_ATOMIC
// Groups the rows by the integer column GroupBy, filling Grouping with
// (column value -> row ids), processing row partitions in an OpenMP parallel loop.
// The id stored for a row is its physical row index when UsePhysicalIds is set,
// otherwise the value of the table's id column.
// Throws when UsePhysicalIds is false and the table has no id column, and when
// GroupBy is missing or not of type atInt (via GroupingSanityCheck).
// Side effect: prints the elapsed wall-clock time to stdout.
// NOTE(review): UpdateGrouping is called concurrently on the shared Grouping;
// this presumably relies on THashMP being safe for concurrent updates - confirm.
1205 void TTable::GroupByIntColMP(const TStr& GroupBy, THashMP<TInt, TIntV>& Grouping, TBool UsePhysicalIds) const {
  // Start timestamp (seconds, with microsecond fraction).
1206  timeval timer0;
1207  gettimeofday(&timer0, NULL);
1208  double t1 = timer0.tv_sec + (timer0.tv_usec/1000000.0);
1209  //printf("X\n");
1210  TInt IdColIdx = GetColIdx(IdColName);
1211  TInt GroupByColIdx = GetColIdx(GroupBy);
1212  if(!UsePhysicalIds && IdColIdx < 0){
1213  TExcept::Throw("Grouping: Either use physical row ids, or have an id column");
1214  }
1215  //double startFn = omp_get_wtime();
1216  GroupingSanityCheck(GroupBy, atInt);
1217  TIntPrV Partitions;
1218  GetPartitionRanges(Partitions, 8*CHUNKS_PER_THREAD);
  // NOTE(review): PartitionSize is not read anywhere else in the visible code.
1219  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
1220  //double endPart = omp_get_wtime();
1221  //printf("Partition time = %f\n", endPart-startFn);
1223  Grouping.Gen(NumValidRows);
1224  //double endGen = omp_get_wtime();
1225  //printf("Gen time = %f\n", endGen-endPart);
1226  //printf("S\n");
1227  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD) //num_threads(1)
1228  for (int i = 0; i < Partitions.Len(); i++){
  // Each iteration walks one partition: from its start row up to, not including, its end row.
1229  TRowIterator RowI(Partitions[i].GetVal1(), this);
1230  TRowIterator EndI(Partitions[i].GetVal2(), this);
1231  while (RowI < EndI) {
1232  TInt idx = UsePhysicalIds ? RowI.GetRowIdx() : RowI.GetIntAttr(IdColIdx);
1233  // printf("updating grouping with key = %d, row_id = %d\n", RowI.GetIntAttr(GroupBy).Val, idx.Val);
1234  UpdateGrouping<TInt>(Grouping, RowI.GetIntAttr(GroupByColIdx), idx);
1235  RowI++;
1236  }
1237  }
  // End timestamp; report the elapsed time.
1238  gettimeofday(&timer0, NULL);
1239  double t2 = timer0.tv_sec + (timer0.tv_usec/1000000.0);
1240  printf("Grouping time: %f\n", t2 - t1);
1241  //double endAdd = omp_get_wtime();
1242  //printf("Add time = %f\n", endAdd-endGen);
1243 }
1244 #endif // GCC_ATOMIC
1246 void TTable::Unique(const TStr& Col) {
1247  TIntV RemainingRows;
1248  TStr NCol = NormalizeColName(Col);
1249  switch (GetColType(NCol)) {
1250  case atInt: {
1251  TIntIntVH Grouping;
1252  GroupByIntCol(NCol, Grouping, TIntV(), true, true);
1253  for (TIntIntVH::TIter it = Grouping.BegI(); it < Grouping.EndI(); it++) {
1254  RemainingRows.Add(it->Dat[0]);
1255  }
1256  break;
1257  }
1258  case atFlt: {
1259  THash<TFlt,TIntV> Grouping;
1260  GroupByFltCol(NCol, Grouping, TIntV(), true, true);
1261  for (THash<TFlt,TIntV>::TIter it = Grouping.BegI(); it < Grouping.EndI(); it++) {
1262  RemainingRows.Add(it->Dat[0]);
1263  }
1264  break;
1265  }
1266  case atStr: {
1267  TIntIntVH Grouping;
1268  GroupByStrCol(NCol, Grouping, TIntV(), true, true);
1269  for (TIntIntVH::TIter it = Grouping.BegI(); it < Grouping.EndI(); it++) {
1270  RemainingRows.Add(it->Dat[0]);
1271  }
1272  break;
1273  }
1274  }
1275  KeepSortedRows(RemainingRows);
1276 }
// Reduces the table to one row per distinct combination of values in Cols.
// A single column is delegated to Unique(const TStr&). Otherwise GroupAux is run
// in keep-unique mode with physical row ids, which collects the first row seen
// for every group key in UniqueVec; all other rows are then removed.
// Ordered is forwarded to GroupAux (when false, the key values are sorted before
// comparison, so the order of values across columns does not matter).
// NOTE(review): Grouping is passed to GroupAux but its declaration is not
// present in this listing - confirm against the full file.
1278 void TTable::Unique(const TStrV& Cols, TBool Ordered) {
1279  if(Cols.Len() == 1){
1280  Unique(Cols[0]);
1281  return;
1282  }
1283  TStrV NCols = NormalizeColNameV(Cols);
1285  TIntV UniqueVec;
1286  GroupAux(NCols, Grouping, Ordered, "", true, UniqueVec, true);
1287  KeepSortedRows(UniqueVec);
1288 }
// Registers GroupColName as an integer column and writes the group id of every
// listed row into it. GroupAndRowIds holds (group id, physical row index) pairs.
// NOTE(review): nothing in the visible lines appends a new vector to IntCols, so
// as listed IntCols[L-1] is the last pre-existing integer column; the listing
// skips a line right after the first comment - confirm against the full file.
1290 void TTable::StoreGroupCol(const TStr& GroupColName, const TVec<TPair<TInt, TInt> >& GroupAndRowIds) {
1291  // Add a column where the value of the i'th row is the group id of row i.
1293  TInt L = IntCols.Len();
1294  AddColType(GroupColName, atInt, L-1);
1295  // Store group id for each row.
1296  for (TInt i = 0; i < GroupAndRowIds.Len(); i++) {
1297  IntCols[L-1][GroupAndRowIds[i].Val2] = GroupAndRowIds[i].Val1;
1298  }
1299 }
1301 // Core grouping logic.
// Groups the valid rows by the columns in GroupBy and fills Grouping with
// (group key -> (group number, ids of the rows in the group)).
//   Ordered        - when false, the key values of each type are sorted, so rows
//                    with the same values in a different column order share a group.
//   GroupColName   - when non-empty, a column of that name holding each row's
//                    group number is stored and added to the schema.
//   KeepUnique     - when true, only the first row of each group is recorded: its
//                    id is appended to UniqueVec, later rows of the group are
//                    ignored, and the table's group mappings are not updated.
//   UsePhysicalIds - when true, rows are identified by physical row index,
//                    otherwise by the value of the id column.
// Throws if a GroupBy column does not exist, or if UsePhysicalIds is false and
// the table has no id column.
1302 void TTable::GroupAux(const TStrV& GroupBy, THash<TGroupKey, TPair<TInt, TIntV> >& Grouping,
1303  TBool Ordered, const TStr& GroupColName, TBool KeepUnique, TIntV& UniqueVec, TBool UsePhysicalIds) {
1304  TInt IdColIdx = GetColIdx(IdColName);
1305  if(!UsePhysicalIds && IdColIdx < 0){
1306  TExcept::Throw("Grouping: Either use physical row ids, or have an id column");
1307  }
  // Per-type lists of the storage indices of the group-by columns.
1308  TIntV IntGroupByCols;
1309  TIntV FltGroupByCols;
1310  TIntV StrGroupByCols;
1311  // get indices for each column type
1312  for (TInt c = 0; c < GroupBy.Len(); c++) {
1313  //printf("GroupBy col %d: %s\n", c.Val, GroupBy[c].CStr());
1314  if (!IsColName(GroupBy[c])) {
1315  TExcept::Throw("no such column " + GroupBy[c]);
1316  }
1318  TPair<TAttrType, TInt> ColType = GetColTypeMap(GroupBy[c]);
1319  switch (ColType.Val1) {
1320  case atInt:
1321  IntGroupByCols.Add(ColType.Val2);
1322  break;
1323  case atFlt:
1324  FltGroupByCols.Add(ColType.Val2);
1325  break;
1326  case atStr:
1327  StrGroupByCols.Add(ColType.Val2);
1328  break;
1329  }
1330  }
1332  TInt IKLen = IntGroupByCols.Len();
1333  TInt FKLen = FltGroupByCols.Len();
1334  TInt SKLen = StrGroupByCols.Len();
  // Next group number to hand out, and the (group number, physical row index)
  // pairs collected for the optional group column.
1336  TInt GroupNum = 0;
1337  TVec<TPair<TInt, TInt> > GroupAndRowIds;
1338  //printf("done GroupAux initialization\n");
1340  // iterate over rows
1341  for (TRowIterator it = BegRI(); it < EndRI(); it++) {
  // The integer part of the key holds the int values followed by the string
  // values (appended below); the float part holds the float values.
1342  TIntV IKey(IKLen + SKLen, 0);
1343  TFltV FKey(FKLen, 0);
1344  TIntV SKey(SKLen, 0);
1346  // find group key
1347  for (TInt c = 0; c < IKLen; c++) {
1348  IKey.Add(it.GetIntAttr(IntGroupByCols[c]));
1349  }
1350  for (TInt c = 0; c < FKLen; c++) {
1351  FKey.Add(it.GetFltAttr(FltGroupByCols[c]));
1352  }
1353  for (TInt c = 0; c < SKLen; c++) {
1354  SKey.Add(it.GetStrMapById(StrGroupByCols[c]));
1355  }
  // Unordered grouping: sort each typed part so column order does not affect the key.
1356  if (!Ordered) {
1357  if (IKLen > 0) { IKey.ISort(0, IKey.Len()-1, true); }
1358  if (FKLen > 0) { FKey.ISort(0, FKey.Len()-1, true); }
1359  if (SKLen > 0) { SKey.ISort(0, SKey.Len()-1, true); }
1360  }
1361  for (TInt c = 0; c < SKLen; c++) {
1362  IKey.Add(SKey[c]);
1363  }
1365  // look for group matching the key
1366  TGroupKey GroupKey = TGroupKey(IKey, FKey);
  // RowIdx is always the physical index (used for the group column);
  // idx is the id stored in the grouping.
1368  TInt RowIdx = it.GetRowIdx();
1369  TInt idx = UsePhysicalIds ? it.GetRowIdx() : IntCols[IdColIdx][it.GetRowIdx()];
1370  if (!Grouping.IsKey(GroupKey)) {
1371  // Grouping key hasn't been seen before, create a new group
1372  TPair<TInt, TIntV> NewGroup;
1373  NewGroup.Val1 = GroupNum;
1374  NewGroup.Val2.Add(idx);
1375  Grouping.AddDat(GroupKey, NewGroup);
1376  if (GroupColName != "") {
1377  GroupAndRowIds.Add(TPair<TInt, TInt>(GroupNum, RowIdx));
1378  }
1379  if (KeepUnique) {
1380  UniqueVec.Add(idx);
1381  }
1382  GroupNum++;
1383  } else {
1384  // Grouping key has been seen before, update corresponding group
1385  if (!KeepUnique) {
1386  TPair<TInt, TIntV>& NewGroup = Grouping.GetDat(GroupKey);
1387  NewGroup.Val2.Add(idx);
1388  if (GroupColName != "") {
1389  GroupAndRowIds.Add(TPair<TInt, TInt>(NewGroup.Val1, RowIdx));
1390  }
1391  }
1392  }
1393  }
1394  // printf("KeepUnique: %d\n", KeepUnique.Val);
1395  // update group mapping
1396  if (!KeepUnique) {
  // Record this grouping under its statement (columns, Ordered, UsePhysicalIds)
  // in the table's mappings; Aggregate looks the statement up in GroupMapping.
1397  GroupStmt Stmt(NormalizeColNameV(GroupBy), Ordered, UsePhysicalIds);
1398  GroupStmtNames.AddDat(GroupColName, Stmt);
1399  GroupIDMapping.AddKey(Stmt);
1400  GroupMapping.AddKey(Stmt);
1401  //printf("Adding statement: ");
1402  //Stmt.Print();
1403  for (THash<TGroupKey, TPair<TInt, TIntV> >::TIter it = Grouping.BegI(); it < Grouping.EndI(); it++) {
1404  TGroupKey key = it.GetKey();
1405  TPair<TInt, TIntV> group = it.GetDat();
1406  GroupIDMapping.GetDat(Stmt).AddDat(group.Val1, TGroupKey(key));
1407  GroupMapping.GetDat(Stmt).AddDat(TGroupKey(key), TIntV(group.Val2));
1408  }
1409  }
1411  // add a column to the table
1412  if (GroupColName != "") {
1413  StoreGroupCol(GroupColName, GroupAndRowIds);
1414  AddSchemaCol(GroupColName, atInt); // update schema
1415  }
1416 }
1418 /*
1419 // Core grouping logic.
1420 #ifdef USE_OPENMP
1421 void TTable::GroupAuxMP(const TStrV& GroupBy, THashGenericMP<TGroupKey, TPair<TInt, TIntV> >& Grouping,
1422  TBool Ordered, const TStr& GroupColName, TBool KeepUnique, TIntV& UniqueVec, TBool UsePhysicalIds) {
1423  //double startFn = omp_get_wtime();
1424  TIntV IntGroupByCols;
1425  TIntV FltGroupByCols;
1426  TIntV StrGroupByCols;
1427  // get indices for each column type
1428  for (TInt c = 0; c < GroupBy.Len(); c++) {
1429  if (!IsColName(GroupBy[c])) {
1430  TExcept::Throw("no such column " + GroupBy[c]);
1431  }
1433  TPair<TAttrType, TInt> ColType = GetColTypeMap(GroupBy[c]);
1434  switch (ColType.Val1) {
1435  case atInt:
1436  IntGroupByCols.Add(ColType.Val2);
1437  break;
1438  case atFlt:
1439  FltGroupByCols.Add(ColType.Val2);
1440  break;
1441  case atStr:
1442  StrGroupByCols.Add(ColType.Val2);
1443  break;
1444  }
1445  }
1447  TInt IKLen = IntGroupByCols.Len();
1448  TInt FKLen = FltGroupByCols.Len();
1449  TInt SKLen = StrGroupByCols.Len();
1451  TInt GroupNum = 0;
1452  TInt IdColIdx = GetColIdx(IdColName);
1454  //double endInit = omp_get_wtime();
1455  //printf("Init time = %f\n", endInit-startFn);
1457  TVec<TPair<TInt, TInt> > GroupAndRowIds;
1459  // iterate over rows
1460  for (TRowIterator it = BegRI(); it < EndRI(); it++) {
1461  TIntV IKey(IKLen + SKLen, 0);
1462  TFltV FKey(FKLen, 0);
1463  TIntV SKey(SKLen, 0);
1465  // find group key
1466  for (TInt c = 0; c < IKLen; c++) {
1467  IKey.Add(it.GetIntAttr(IntGroupByCols[c]));
1468  }
1469  for (TInt c = 0; c < FKLen; c++) {
1470  FKey.Add(it.GetFltAttr(FltGroupByCols[c]));
1471  }
1472  for (TInt c = 0; c < SKLen; c++) {
1473  SKey.Add(it.GetStrMapById(StrGroupByCols[c]));
1474  }
1475  if (!Ordered) {
1476  if (IKLen > 0) { IKey.ISort(0, IKey.Len()-1, true); }
1477  if (FKLen > 0) { FKey.ISort(0, FKey.Len()-1, true); }
1478  if (SKLen > 0) { SKey.ISort(0, SKey.Len()-1, true); }
1479  }
1480  for (TInt c = 0; c < SKLen; c++) {
1481  IKey.Add(SKey[c]);
1482  }
1484  // look for group matching the key
1485  TGroupKey GroupKey = TGroupKey(IKey, FKey);
1487  TInt RowIdx = it.GetRowIdx();
1488  if (!Grouping.IsKey(GroupKey)) {
1489  // Grouping key hasn't been seen before, create a new group
1490  TPair<TInt, TIntV> NewGroup;
1491  NewGroup.Val1 = GroupNum;
1492  if(IdColIdx > 0){
1493  NewGroup.Val2.Add(IntCols[IdColIdx][RowIdx]);
1494  }
1495  Grouping.AddDat(GroupKey, NewGroup);
1496  if (GroupColName != "") {
1497  GroupAndRowIds.Add(TPair<TInt, TInt>(GroupNum, RowIdx));
1498  }
1499  if (KeepUnique) {
1500  UniqueVec.Add(RowIdx);
1501  }
1502  GroupNum++;
1503  } else {
1504  // Grouping key has been seen before, update corresponding group
1505  if (!KeepUnique) {
1506  TPair<TInt, TIntV>& NewGroup = Grouping.GetDat(GroupKey);
1507  if(IdColIdx > 0){
1508  NewGroup.Val2.Add(IntCols[IdColIdx][RowIdx]);
1509  }
1510  if (GroupColName != "") {
1511  GroupAndRowIds.Add(TPair<TInt, TInt>(NewGroup.Val1, RowIdx));
1512  }
1513  }
1514  }
1515  }
1517  //double endIter = omp_get_wtime();
1518  //printf("Iter time = %f\n", endIter-endInit);
1520  // update group mapping
1521  if (!KeepUnique) {
1522  TPair<TStrV, TBool> GroupStmt(GroupBy, Ordered);
1523  GroupStmtNames.AddDat(GroupColName, GroupStmt);
1524  GroupIDMapping.AddDat(GroupStmt);
1525  GroupMapping.AddDat(GroupStmt);
1526  for (THash<TGroupKey, TPair<TInt, TIntV> >::TIter it = Grouping.BegI(); it < Grouping.EndI(); it++) {
1527  TGroupKey key = it.GetKey();
1528  TPair<TInt, TIntV> group = it.GetDat();
1529  GroupIDMapping.GetDat(GroupStmt).AddDat(group.Val1, key);
1530  GroupMapping.GetDat(GroupStmt).AddDat(key, group.Val2);
1531  }
1532  }
1534  //double endMapping = omp_get_wtime();
1535  //printf("Mapping time = %f\n", endMapping-endIter);
1537  // add a column to the table
1538  if (GroupColName != "") {
1539  StoreGroupCol(GroupColName, GroupAndRowIds);
1540  AddSchemaCol(GroupColName, atInt); // update schema
1541  }
1543  //double endStore = omp_get_wtime();
1544  //printf("Store time = %f\n", endStore-endMapping);
1545 }
1546 #endif // USE_OPENMP
1547 */
// Groups the rows by the GroupBy columns and stores each row's group number in a
// new column named GroupColName. Column names are normalized first; the work is
// done by GroupAux with KeepUnique disabled, so UniqueVec is only a placeholder.
// NOTE(review): Grouping is passed to GroupAux but its declaration is not
// present in this listing - confirm against the full file.
1549 void TTable::Group(const TStrV& GroupBy, const TStr& GroupColName, TBool Ordered, TBool UsePhysicalIds) {
1550  TStrV NGroupBy = NormalizeColNameV(GroupBy);
1551  TStr NGroupColName = NormalizeColName(GroupColName);
1552  TIntV UniqueVec;
1554  GroupAux(NGroupBy, Grouping, Ordered, NGroupColName, false, UniqueVec, UsePhysicalIds);
1555 }
1558  //TODO
1559 }
1562  //TODO
1563 }
1565 void TTable::Aggregate(const TStrV& GroupByAttrs, TAttrAggr AggOp,
1566  const TStr& ValAttr, const TStr& ResAttr, TBool Ordered) {
1568  for (TInt c = 0; c < GroupByAttrs.Len(); c++) {
1569  if (!IsColName(GroupByAttrs[c])) {
1570  TExcept::Throw("no such column " + GroupByAttrs[c]);
1571  }
1572  }
1574  // double startFn = omp_get_wtime();
1575  TStrV NGroupByAttrs = NormalizeColNameV(GroupByAttrs);
1576  TBool UsePhysicalIds = (GetColIdx(IdColName) < 0);
1578  THash<TInt,TIntV> GroupByIntMapping;
1579  THash<TFlt,TIntV> GroupByFltMapping;
1580  THash<TInt,TIntV> GroupByStrMapping;
1581  THash<TGroupKey,TIntV> Mapping;
1582 #ifdef GCC_ATOMIC
1583  THashMP<TInt,TIntV> GroupByIntMapping_MP(NumValidRows);
1584  TIntV GroupByIntMPKeys(NumValidRows);
1585 #endif
1586  TInt NumOfGroups = 0;
1587  TInt GroupingCase = 0;
1589  // check if grouping already exists
1590  GroupStmt Stmt(NGroupByAttrs, Ordered, UsePhysicalIds);
1591  if (GroupMapping.IsKey(Stmt)) {
1592  Mapping = GroupMapping.GetDat(Stmt);
1593  } else{
1594  if(NGroupByAttrs.Len() == 1){
1595  switch(GetColType(NGroupByAttrs[0])){
1596  case atInt:
1597 #ifdef GCC_ATOMIC
1598  if(GetMP()){
1599  GroupByIntColMP(NGroupByAttrs[0], GroupByIntMapping_MP, UsePhysicalIds);
1600  int x = 0;
1601  for(THashMP<TInt,TIntV>::TIter it = GroupByIntMapping_MP.BegI(); it < GroupByIntMapping_MP.EndI(); it++){
1602  GroupByIntMPKeys[x] = it.GetKey();
1603  x++;
1604  /*
1605  printf("%d --> ", it.GetKey().Val);
1606  TIntV& V = it.GetDat();
1607  for(int i = 0; i < V.Len(); i++){
1608  printf(" %d", V[i].Val);
1609  }
1610  printf("\n");
1611  */
1612  }
1613  NumOfGroups = x;
1614  GroupingCase = 4;
1615  //printf("Number of groups: %d\n", NumOfGroups.Val);
1616  break;
1617  }
1618 #endif // GCC_ATOMIC
1619  GroupByIntCol(NGroupByAttrs[0], GroupByIntMapping, TIntV(), true, UsePhysicalIds);
1620  NumOfGroups = GroupByIntMapping.Len();
1621  GroupingCase = 1;
1622  break;
1623  case atFlt:
1624  GroupByFltCol(NGroupByAttrs[0], GroupByFltMapping, TIntV(), true, UsePhysicalIds);
1625  NumOfGroups = GroupByFltMapping.Len();
1626  GroupingCase = 2;
1627  break;
1628  case atStr:
1629  GroupByStrCol(NGroupByAttrs[0], GroupByStrMapping, TIntV(), true, UsePhysicalIds);
1630  NumOfGroups = GroupByStrMapping.Len();
1631  GroupingCase = 3;
1632  break;
1633  }
1634  }
1635  else{
1636  TIntV UniqueVector;
1638  GroupAux(NGroupByAttrs, Mapping_aux, Ordered, "", false, UniqueVector, UsePhysicalIds);
1639  for(THash<TGroupKey, TPair<TInt, TIntV> >::TIter it = Mapping_aux.BegI(); it < Mapping_aux.EndI(); it++){
1640  Mapping.AddDat(it.GetKey(), it.GetDat().Val2);
1641  }
1642  NumOfGroups = Mapping.Len();
1643  }
1644  }
1646  // double endGroup = omp_get_wtime();
1647  // printf("Group time = %f\n", endGroup-startFn);
1649  TAttrType T = GetColType(ValAttr);
1651  // add column corresponding to result attribute type
1652  if (AggOp == aaCount) { AddIntCol(ResAttr); }
1653  else {
1654  if (T == atInt) { AddIntCol(ResAttr); }
1655  else if (T == atFlt) { AddFltCol(ResAttr); }
1656  else {
1657  // Count is the only aggregation operation handled for Str
1658  TExcept::Throw("Invalid aggregation for Str type!");
1659  }
1660  }
1661  TInt ColIdx = GetColIdx(ResAttr);
1662  TInt AggrColIdx = GetColIdx(ValAttr);
1664  // double endAdd = omp_get_wtime();
1665  // printf("AddCol time = %f\n", endAdd-endGroup);
1667 #ifdef USE_OPENMP
1668  #pragma omp parallel for schedule(dynamic)
1669 #endif
1670  for (int g = 0; g < NumOfGroups; g++) {
1671  TIntV* GroupRows = NULL;
1672  switch(GroupingCase){
1673  case 0:
1674  GroupRows = & Mapping.GetDat(Mapping.GetKey(g));
1675  break;
1676  case 1:
1677  GroupRows = & GroupByIntMapping.GetDat(GroupByIntMapping.GetKey(g));
1678  break;
1679  case 2:
1680  GroupRows = & GroupByIntMapping.GetDat(GroupByIntMapping.GetKey(g));
1681  break;
1682  case 3:
1683  GroupRows = & GroupByStrMapping.GetDat(GroupByStrMapping.GetKey(g));
1684  break;
1685  case 4:
1686 #ifdef GCC_ATOMIC
1687  GroupRows = & GroupByIntMapping_MP.GetDat(GroupByIntMPKeys[g]);
1688 #endif
1689  break;
1690  }
1692  // find valid rows of group
1693  /*
1694  TIntV ValidRows;
1695  for (TInt i = 0; i < GroupRows.Len(); i++) {
1696  // TODO: This should not be necessary
1697  if (!RowIdMap.IsKey(GroupRows[i])) { continue; }
1698  TInt RowId = RowIdMap.GetDat(GroupRows[i]);
1699  // GroupRows has physical row indices
1700  if (RowId != Invalid) { ValidRows.Add(RowId); }
1701  }
1702  */
1703  TIntV& ValidRows = *GroupRows;
1704  TInt sz = ValidRows.Len();
1705  if (sz <= 0) continue;
1706  // Count is handled separately (other operations have aggregation policies defined in a template)
1707  if (AggOp == aaCount) {
1708  for (TInt i = 0; i < sz; i++) { IntCols[ColIdx][ValidRows[i]] = sz; }
1709  } else {
1710  // aggregate based on column type
1711  if (T == atInt) {
1712  TIntV V;
1713  for (TInt i = 0; i < sz; i++) { V.Add(IntCols[AggrColIdx][ValidRows[i]]); }
1714  TInt Res = AggregateVector<TInt>(V, AggOp);
1715  if (AggOp == aaMean) { Res = Res / sz; }
1716  for (TInt i = 0; i < sz; i++) { IntCols[ColIdx][ValidRows[i]] = Res; }
1717  } else {
1718  TFltV V;
1719  for (TInt i = 0; i < sz; i++) { V.Add(FltCols[AggrColIdx][ValidRows[i]]); }
1720  TFlt Res = AggregateVector<TFlt>(V, AggOp);
1721  if (AggOp == aaMean) { Res /= sz; }
1722  for (TInt i = 0; i < sz; i++) { FltCols[ColIdx][ValidRows[i]] = Res; }
1723  }
1724  }
1725  }
1726  // double endIter = omp_get_wtime();
1727  // printf("Iter time = %f\n", endIter-endAdd);
1728 }
// Row-wise aggregation: for every valid row, applies AggOp to the row's values in
// the AggrAttrs columns and stores the result in a new column ResAttr.
// All AggrAttrs columns must share one type; only integer and float columns are
// supported, and the result column has that same type. Throws otherwise.
// NOTE(review): Info is used below but its declaration is not present in this
// listing; it holds the (type, storage index) pairs returned by GetColTypeMap.
// NOTE(review): an empty AggrAttrs reaches Info[0] with no elements.
// NOTE(review): Aggregate divides the AggregateVector result by the group size
// for aaMean, while this routine stores the result as-is - confirm whether a
// division by AggrAttrs.Len() is intended here.
1730 void TTable::AggregateCols(const TStrV& AggrAttrs, TAttrAggr AggOp, const TStr& ResAttr) {
  // Collect the column descriptors and insist they all match the first one's type.
1732  for (TInt i = 0; i < AggrAttrs.Len(); i++) {
1733  Info.Add(GetColTypeMap(AggrAttrs[i]));
1734  if (Info[i].Val1 != Info[0].Val1) {
1735  TExcept::Throw("AggregateCols: Aggregation attributes must have the same type");
1736  }
1737  }
1739  if (Info[0].Val1 == atInt) {
1740  AddIntCol(ResAttr);
1741  TInt ResIdx = GetColIdx(ResAttr);
1743  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
1744  TInt RowIdx = RI.GetRowIdx();
  // Gather this row's values across the aggregated columns.
1745  TIntV V;
1746  for (TInt i = 0; i < AggrAttrs.Len(); i++) {
1747  V.Add(IntCols[Info[i].Val2][RowIdx]);
1748  }
1749  IntCols[ResIdx][RowIdx] = AggregateVector<TInt>(V, AggOp);
1750  }
1751  } else if (Info[0].Val1 == atFlt) {
1752  AddFltCol(ResAttr);
1753  TInt ResIdx = GetColIdx(ResAttr);
1755  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
1756  TInt RowIdx = RI.GetRowIdx();
1757  TFltV V;
1758  for (TInt i = 0; i < AggrAttrs.Len(); i++) {
1759  V.Add(FltCols[Info[i].Val2][RowIdx]);
1760  }
1761  FltCols[ResIdx][RowIdx] = AggregateVector<TFlt>(V, AggOp);
1762  }
1763  } else {
1764  TExcept::Throw("AggregateCols: Only Int and Flt aggregation supported right now");
1765  }
1766 }
1769  for(THash<TGroupKey, TIntV>::TIter it = Mapping.BegI(); it < Mapping.EndI(); it++){
1770  TGroupKey gk = it.GetKey();
1771  TIntV ik = gk.Val1;
1772  TFltV fk = gk.Val2;
1773  for(int i = 0; i < ik.Len(); i++){ printf("%d ",ik[i].Val);}
1774  for(int i = 0; i < fk.Len(); i++){ printf("%f ",fk[i].Val);}
1775  printf("-->");
1776  TIntV v = it.GetDat();
1777  for(int i = 0; i < v.Len(); i++){ printf("%d ",v[i].Val);}
1778  printf("\n");
1779  }
1780 }
1782 void TTable::Count(const TStr& CountColName, const TStr& Col) {
1783  TStrV GroupByAttrs;
1784  GroupByAttrs.Add(CountColName);
1785  Aggregate(GroupByAttrs, aaCount, "", Col);
1786 }
// Splits the table into one new table per group of the GroupBy columns and
// returns those tables.
// Each group table uses this table's schema without the id column, receives a
// copy of every row of the group, and gets fresh ids through InitIds().
// Ordered is forwarded to GroupAux.
// NOTE(review): Grouping is passed to GroupAux and iterated below, but its
// declaration is not present in this listing - confirm against the full file.
1788 TVec<PTable> TTable::SpliceByGroup(const TStrV& GroupBy, TBool Ordered) {
1789  TStrV NGroupBy = NormalizeColNameV(GroupBy);
1790  TIntV UniqueVec;
1792  TVec<PTable> Result;
  // Schema for the group tables: every column except the id column.
1794  Schema NewSchema;
1795  for (TInt c = 0; c < Sch.Len(); c++) {
1796  if (Sch[c].Val1 != GetIdColName()) {
1797  NewSchema.Add(Sch[c]);
1798  }
1799  }
1801  GroupAux(NGroupBy, Grouping, Ordered, "", false, UniqueVec);
  // NOTE(review): cnt is incremented per group but not read in the visible code.
1803  TInt cnt = 0;
1804  // iterate over groups
1805  for (THash<TGroupKey, TPair<TInt, TIntV> >::TIter it = Grouping.BegI(); it != Grouping.EndI(); it++) {
1806  PTable GroupTable = TTable::New(NewSchema, Context);
  // For each source schema column: ColInfo is its (type, storage index) in the
  // group table, V is its index in this table. The id column is marked with -1
  // so that it is skipped when copying.
1808  TVec<TPair<TAttrType, TInt> > ColInfo;
1809  TIntV V;
1810  for (TInt i = 0; i < Sch.Len(); i++) {
1811  ColInfo.Add(GroupTable->GetColTypeMap(Sch[i].Val1));
1812  if (Sch[i].Val1 == IdColName()) {
1813  ColInfo[i].Val2 = -1;
1814  }
1815  V.Add(GetColIdx(Sch[i].Val1));
1816  }
1818  TIntV& Rows = it.GetDat().Val2;
1820  // iterate over rows in group
1821  for (TInt i = 0; i < Rows.Len(); i++) {
1822  // convert from permanent ID to row ID
1823  TInt RowIdx = RowIdMap.GetDat(Rows[i]);
1825  // iterate over schema
1826  for (TInt c = 0; c < Sch.Len(); c++) {
1827  TPair<TAttrType, TInt> Info = ColInfo[c];
1828  TInt ColIdx = Info.Val2;
1830  if (ColIdx == -1) { continue; }
1832  // add row to new group
1833  switch (Info.Val1) {
1834  case atInt:
1835  GroupTable->IntCols[ColIdx].Add(IntCols[V[c]][RowIdx]);
1836  break;
1837  case atFlt:
1838  GroupTable->FltCols[ColIdx].Add(FltCols[V[c]][RowIdx]);
1839  break;
1840  case atStr:
1841  GroupTable->StrColMaps[ColIdx].Add(StrColMaps[V[c]][RowIdx]);
1842  break;
1843  }
1845  }
  // Append the new row to the group table's chain of valid rows.
1846  if (GroupTable->LastValidRow >= 0) {
1847  GroupTable->Next[GroupTable->LastValidRow] = GroupTable->NumRows;
1848  }
1849  GroupTable->Next.Add(GroupTable->Last);
1850  GroupTable->LastValidRow = GroupTable->NumRows;
1852  GroupTable->NumRows++;
1853  GroupTable->NumValidRows++;
1854  }
1855  GroupTable->InitIds();
1856  Result.Add(GroupTable);
1858  cnt += 1;
1859  }
1860  return Result;
1861 }
1864  IdColName = "_id";
1865  //Assert(NumRows == NumValidRows);
1867 }
1870  RowIdMap.Clr();
1871  TInt IdColIdx = GetColIdx(IdColName);
1872  TInt IdCnt = 0;
1873  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
1874  IntCols[IdColIdx][RI.GetRowIdx()] = IdCnt;
1875  RowIdMap.AddDat(RI.GetRowIdx(), IdCnt);
1876  IdCnt++;
1877  }
1878 }
1880 void TTable::AddIdColumn(const TStr& ColName) {
1881  //printf("NumRows: %d\n", NumRows.Val);
1882  TInt IdCol = IntCols.Add();
1883  IntCols[IdCol].Reserve(NumRows, NumRows);
1884  //printf("IdCol Reserved\n");
1885  TInt IdCnt = 0;
1886  RowIdMap.Clr();
1887  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
1888  IntCols[IdCol][RI.GetRowIdx()] = IdCnt;
1889  RowIdMap.AddDat(IdCnt, RI.GetRowIdx());
1890  IdCnt++;
1891  }
1892  AddSchemaCol(ColName, atInt);
1893  AddColType(ColName, atInt, IntCols.Len()-1);
1894 }
1897  PTable JointTable = New(Context);
1898  JointTable->IntCols = TVec<TIntV>(IntCols.Len() + Table.IntCols.Len() + 1);
1899  JointTable->FltCols = TVec<TFltV>(FltCols.Len() + Table.FltCols.Len());
1900  JointTable->StrColMaps = TVec<TIntV>(StrColMaps.Len() + Table.StrColMaps.Len());
1901  for (TInt i = 0; i < Sch.Len(); i++) {
1902  TStr ColName = GetSchemaColName(i);
1903  TAttrType ColType = GetSchemaColType(i);
1904  TStr CName = JointTable->RenumberColName(ColName);
1905  TPair<TAttrType, TInt> TypeMap = GetColTypeMap(ColName);
1906  JointTable->AddColType(CName, TypeMap);
1907  //JointTable->AddLabel(CName, ColName);
1908  JointTable->AddSchemaCol(CName, ColType);
1909  }
1910  for (TInt i = 0; i < Table.Sch.Len(); i++) {
1911  TStr ColName = Table.GetSchemaColName(i);
1912  TAttrType ColType = Table.GetSchemaColType(i);
1913  TStr CName = JointTable->RenumberColName(ColName);
1914  TPair<TAttrType, TInt> NewDat = Table.GetColTypeMap(ColName);
1915  Assert(ColType == NewDat.Val1);
1916  // add offsets
1917  switch (NewDat.Val1) {
1918  case atInt:
1919  NewDat.Val2 += IntCols.Len();
1920  break;
1921  case atFlt:
1922  NewDat.Val2 += FltCols.Len();
1923  break;
1924  case atStr:
1925  NewDat.Val2 += StrColMaps.Len();
1926  break;
1927  }
1928  JointTable->AddColType(CName, NewDat);
1929  JointTable->AddSchemaCol(CName, ColType);
1930  }
1931  TStr IdColName = "_id";
1932  JointTable->AddColType(IdColName, atInt, IntCols.Len() + Table.IntCols.Len());
1933  JointTable->AddSchemaCol(IdColName, atInt);
1934  return JointTable;
1935 }
// Appends one row to this (joint) table by concatenating row RowIdx1 of T1 and
// row RowIdx2 of T2. For each type, T1's columns are copied to the same storage
// indices and T2's columns are stored after them, offset by T1's column count.
// The integer column following T2's integer columns receives the new row's index.
// NOTE(review): the listing skips two lines near the end of this function (just
// before the closing brace of the Next.Empty() check and just before the final
// statement); LastValidRow is not updated in the visible code - confirm against
// the full file.
1937 void TTable::AddJointRow(const TTable& T1, const TTable& T2, TInt RowIdx1, TInt RowIdx2) {
  // Copy the T1 part of the row.
1938  for (TInt i = 0; i < T1.IntCols.Len(); i++) {
1939  IntCols[i].Add(T1.IntCols[i][RowIdx1]);
1940  }
1941  for (TInt i = 0; i < T1.FltCols.Len(); i++) {
1942  FltCols[i].Add(T1.FltCols[i][RowIdx1]);
1943  }
1944  for (TInt i = 0; i < T1.StrColMaps.Len(); i++) {
1945  StrColMaps[i].Add(T1.StrColMaps[i][RowIdx1]);
1946  }
  // Copy the T2 part of the row behind the T1 columns of each type.
1947  TInt IntOffset = T1.IntCols.Len();
1948  TInt FltOffset = T1.FltCols.Len();
1949  TInt StrOffset = T1.StrColMaps.Len();
1950  for (TInt i = 0; i < T2.IntCols.Len(); i++) {
1951  IntCols[i+IntOffset].Add(T2.IntCols[i][RowIdx2]);
1952  }
1953  for (TInt i = 0; i < T2.FltCols.Len(); i++) {
1954  FltCols[i+FltOffset].Add(T2.FltCols[i][RowIdx2]);
1955  }
1956  for (TInt i = 0; i < T2.StrColMaps.Len(); i++) {
1957  StrColMaps[i+StrOffset].Add(T2.StrColMaps[i][RowIdx2]);
1958  }
  // Index of the integer column that follows T2's integer columns.
1959  TInt IdOffset = IntOffset + T2.IntCols.Len();
1960  NumRows++;
1961  NumValidRows++;
  // Link the previous last entry of Next to the new row, then terminate the chain.
1962  if (!Next.Empty()) {
1963  Next[Next.Len()-1] = NumValidRows-1;
1965  }
1966  Next.Add(Last);
1968  IntCols[IdOffset].Add(NumRows-1);
1969 }
1974 PTable TTable::SimJoin(const TStrV& Cols1, const TTable& Table, const TStrV& Cols2, const TStr& DistanceColName, const TSimType& SimType, const TFlt& Threshold)
1975 {
1976  Assert(Cols1.Len() == Cols2.Len());
1978  if(Cols1.Len()!=Cols2.Len()){
1979  TExcept::Throw("Column vectors must match in type and length");
1980  }
1982  for (TInt i = 0; i < Cols1.Len(); i++) {
1983  if(!IsColName(Cols1[i]) || !Table.IsColName(Cols2[i])){
1984  TExcept::Throw("Column not found in Table");
1985  }
1987  TAttrType Type1 = GetColType(Cols1[i]);
1988  TAttrType Type2 = GetColType(Cols2[i]);
1990  if(Type1!=Type2){
1991  TExcept::Throw("Column types on the two tables must match.");
1992  }
1994  // When supporting more distance metrics, check if the types are supported for given metric.
1995  if((Type1!=atInt && Type1!=atFlt) || (Type2!=atInt && Type2!=atFlt)){
1996  TExcept::Throw("Column type not supported. Only Flt and Int column types are supported.");
1997  }
1998  }
2000  // Initialize Join table and add the similarity column
2001  PTable JointTable = InitializeJointTable(Table);
2002  TFltV DistanceV;
2004  // O(n^2): Parallelize
2005  for(TRowIterator RowI = this->BegRI(); RowI < this->EndRI(); RowI++) {
2006  for(TRowIterator RowI2 = Table.BegRI(); RowI2 < Table.EndRI(); RowI2++) {
2007  float distance = 0;
2009  switch(SimType)
2010  {
2011  // Calculate the distance metric
2012  case L2Norm:
2013  for(TInt i = 0; i < Cols1.Len(); i++) {
2014  float attrVal1, attrVal2;
2015  attrVal1 = GetColType(Cols1[i])==atInt ? (float)RowI.GetIntAttr(Cols1[i]) : (float)RowI.GetFltAttr(Cols1[i]);
2016  attrVal2 = Table.GetColType(Cols2[i])==atInt ? (float)RowI2.GetIntAttr(Cols2[i]) : (float)RowI2.GetFltAttr(Cols2[i]);
2017  distance += pow(attrVal1 - attrVal2, 2);
2018  }
2020  distance = sqrt(distance);
2022  if(distance<=Threshold){
2023  JointTable->AddJointRow(*this, Table, RowI.GetRowIdx(), RowI2.GetRowIdx());
2024  DistanceV.Add(distance);
2025  }
2027  // Add row to the joint table if distance <= Threshold
2028  break;
2029  // Haversine distance to calculate the distance between two points on Earth from latitude/longitude
2030  case Haversine:
2031  {
2032  if(Cols1.Len()!=2){
2033  TExcept::Throw("Haversine disance expects exactly two attributes - latitude and longitude - in that order.");
2034  }
2036  // Block to prevent cross-initialization error from compiler
2037  TFlt Radius = 6373; // km
2038  float Latitude1 = GetColType(Cols1[0])==atInt ? (float)RowI.GetIntAttr(Cols1[0]) : (float)RowI.GetFltAttr(Cols1[0]);
2039  float Latitude2 = Table.GetColType(Cols2[0])==atInt ? (float)RowI2.GetIntAttr(Cols2[0]) : (float)RowI2.GetFltAttr(Cols2[0]);
2041  float Longitude1 = GetColType(Cols1[1])==atInt ? (float)RowI.GetIntAttr(Cols1[1]) : (float)RowI.GetFltAttr(Cols1[1]);
2042  float Longitude2 = Table.GetColType(Cols2[1])==atInt ? (float)RowI2.GetIntAttr(Cols2[1]) : (float)RowI2.GetFltAttr(Cols2[1]);
2044  Latitude1 *= static_cast<float>(M_PI/180.0);
2045  Latitude2 *= static_cast<float>(M_PI/180.0);
2046  Longitude1 *= static_cast<float>(M_PI/180.0);
2047  Longitude2 *= static_cast<float>(M_PI/180.0);
2049  float dlon = Longitude2 - Longitude1;
2050  float dlat = Latitude2 - Latitude1;
2051  float a = pow(sin(dlat/2), 2) + cos(Latitude1)*cos(Latitude2)*pow(sin(dlon/2), 2);
2052  float c = 2*atan2(sqrt(a), sqrt(1-a));
2053  distance = (static_cast<float>(Radius.Val))*c;
2055  if(distance<=Threshold){
2056  JointTable->AddJointRow(*this, Table, RowI.GetRowIdx(), RowI2.GetRowIdx());
2057  DistanceV.Add(distance);
2058  }
2059  }
2060  break;
2061  case L1Norm:
2062  case Jaccard:
2063  TExcept::Throw("This distance metric is not supported");
2064  }
2065  }
2066  }
2068  // Add the value for the similarity column
2069  JointTable->StoreFltCol(DistanceColName, DistanceV);
2070  JointTable->InitIds();
2071  return JointTable;
2072 }
// Compares the groups of this table with each other and returns a table of the
// group pairs whose distance is at most Threshold.
// GroupAttr is an integer column holding each row's group id; SimCol is an
// integer or string column whose distinct values form the set compared per
// group. The distance computed below is 1 - |intersection| / |union| of the two
// groups' value sets. The result has the integer columns GroupId_1 and GroupId_2
// and the float column DistanceColName.
// Every ordered pair is evaluated, including a group with itself.
// Throws if either column is missing or SimCol is neither Int nor Str.
// NOTE(review): SimType is not consulted in the visible code.
// NOTE(review): the declarations of Group, TIntHH and the TIntH passed to AddDat
// are not present in this listing - confirm against the full file.
2074 PTable TTable::SelfSimJoinPerGroup(const TStr& GroupAttr, const TStr& SimCol, const TStr& DistanceColName, const TSimType& SimType, const TFlt& Threshold)
2075 {
2076  if(!IsColName(SimCol) || !IsColName(GroupAttr)){
2077  TExcept::Throw("No such column found in table");
2078  }
2080  PTable JointTable = New(Context);
2081  // Initialize the joint table - (GroupId1, GroupId2, Similarity)
2082  JointTable->IntCols = TVec<TIntV>(2);
2083  JointTable->FltCols = TVec<TFltV>(1);
2085  for(TInt i=0;i<2;i++){
2086  TInt Suffix = i+1;
2087  TStr CName = "GroupId_" + Suffix.GetStr();
2089  JointTable->AddColType(CName, Group);
2090  JointTable->AddSchemaCol(CName, atInt);
2091  }
2094  JointTable->AddColType(DistanceColName, Group);
2095  JointTable->AddSchemaCol(DistanceColName, atFlt);
2099  TAttrType attrType = GetColType(SimCol);
2100  TInt GroupColIdx = GetColIdx(GroupAttr);
2101  TInt SimColIdx = GetColIdx(SimCol);
  // Collect, per group id, the set of distinct SimCol values (stored as hash keys).
2103  for (TRowIterator RowI = this->BegRI(); RowI < this->EndRI(); RowI++) {
2104  TInt GroupId = IntCols[GroupColIdx][RowI.GetRowIdx()];
2106  if(attrType==atInt || attrType==atStr)
2107  {
2108  if(!TIntHH.IsKey(GroupId)){
2110  TIntHH.AddDat(GroupId, TIntH);
2111  }
2113  THash<TInt, TInt>& TIntH = TIntHH.GetDat(GroupId);
  // String columns contribute the integer stored in StrColMaps.
2114  TInt SimAttrVal = (attrType==atInt ? IntCols[SimColIdx][RowI.GetRowIdx()] : StrColMaps[SimColIdx][RowI.GetRowIdx()]);
2115  TIntH.AddDat(SimAttrVal, 0);
2116  }
2117  else
2118  {
2119  TExcept::Throw("Attribute type not supported.");
2120  }
2121  }
2123  // Iterate through every pair of groups and calculate the distance
2124  for (THash<TInt, THash<TInt, TInt> >::TIter it1 = TIntHH.BegI(); it1 < TIntHH.EndI(); it1++) {
2125  THash<TInt, TInt> Vals1H = it1.GetDat();
2126  TInt GroupId1 = it1.GetKey();
2128  for (THash<TInt, THash<TInt, TInt> >::TIter it2 = TIntHH.BegI(); it2 < TIntHH.EndI(); it2++) {
2129  int intersectionCount = 0;
2130  TInt GroupId2 = it2.GetKey();
2131  THash<TInt, TInt> Vals2H = it2.GetDat();
  // Count the values of the first group that also occur in the second.
2133  for(THash<TInt, TInt>::TIter it = Vals1H.BegI(); it < Vals1H.EndI(); it++)
2134  {
2135  TInt Val = it.GetKey();
2136  if(Vals2H.IsKey(Val)){
2137  intersectionCount+=1;
2138  }
2139  }
2141  int unionCount = Vals1H.Len() + Vals2H.Len() - intersectionCount;
2142  float distance = 1.0f - (float)intersectionCount/unionCount;
2144  // Add a new row to the JointTable
2145  if(distance<=Threshold){
2146  JointTable->IntCols[0].Add(GroupId1);
2147  JointTable->IntCols[1].Add(GroupId2);
2148  JointTable->FltCols[0].Add(distance);
2149  JointTable->IncrementNext();
2150  }
2151  }
2152  }
2154  JointTable->InitIds();
2155  return JointTable;
2156 }
// Multi-column variant of the per-group similarity self join.
//
// Steps performed by the visible code:
//   1. ProjectInPlace is called on this table with the GroupBy columns plus
//      SimCol (presumably dropping every other column of *this* table - confirm
//      that callers expect the receiver to be modified).
//   2. GroupAux is called with the column name "Group" and fills `Grouping`.
//   3. The single-column overload is run on the "Group" column.
//   4. For every row of that result, one representative row of each of the two
//      groups is joined via AddJointRow, the distance column is stored, and the
//      result is projected to the distance column plus the GroupBy columns.
//
// NOTE(review): the declaration of `Grouping` is not present in this listing
// (listing line 2176 is absent).
2160 PTable TTable::SelfSimJoinPerGroup(const TStrV& GroupBy, const TStr& SimCol,
2161  const TStr& DistanceColName, const TSimType& SimType, const TFlt& Threshold) {
2162  TStrV NGroupBy = NormalizeColNameV(GroupBy);
2163  TStrV ProjectionV;
2165  // Only keep the GroupBy cols and the SimCol
 // Note: the un-normalized names (GroupBy, not NGroupBy) are used for the projection.
2166  for(TInt i=0; i<GroupBy.Len(); i++)
2167  {
2168  ProjectionV.Add(GroupBy[i]);
2169  }
2171  ProjectionV.Add(SimCol);
2172  ProjectInPlace(ProjectionV);
2174  TStr CName = "Group";
2175  TIntV UniqueVec;
2177  GroupAux(NGroupBy, Grouping, false, CName, false, UniqueVec);
2178  PTable GroupJointTable = SelfSimJoinPerGroup(CName, SimCol, DistanceColName, SimType, Threshold);
2179  PTable JointTable = InitializeJointTable(*this);
2181  // Hash of groupid to any arbitrary row of that group. Arbitrary because the GroupBy
2182  // columns within that group are the same, so we can choose any one.
2183  THash<TInt, TInt> GroupIdH;
2185  for(THash<TGroupKey, TPair<TInt, TIntV> >::TIter it=Grouping.BegI(); it<Grouping.EndI(); it++)
2186  {
 // Val1 is used as the group number and Val2 as the list of row ids of the group.
2187  TPair<TInt, TIntV> group = it.GetDat();
2188  TInt GroupNum = group.Val1;
2189  TIntV RowIds = group.Val2;
2191  if(!GroupIdH.IsKey(GroupNum))
2192  {
2193  TInt RandomRowId = RowIds[0]; // Arbitrarily select the 1st row.
2194  GroupIdH.AddDat(GroupNum, RandomRowId);
2195  }
2196  }
2198  for(TRowIterator RowI = GroupJointTable->BegRI(); RowI < GroupJointTable->EndRI(); RowI++)
2199  {
2200  // The GroupJoinTable has a well defined structure - columns 0 and 1 are GroupIds
2201  TInt GroupId1 = GroupJointTable->IntCols[0][RowI.GetRowIdx()];
2202  TInt GroupId2 = GroupJointTable->IntCols[1][RowI.GetRowIdx()];
2204  // Look up the representative (arbitrarily selected) row of each of the two groups
2205  TInt RowId1 = GroupIdH.GetDat(GroupId1);
2206  TInt RowId2 = GroupIdH.GetDat(GroupId2);
2207  JointTable->AddJointRow(*this, *this, RowId1, RowId2);
2208  }
2210  // Add the similarity column from the GroupJointTable - GroupJointTable has a
2211  // well defined structure - the first float column is the similarity.
2212  JointTable->StoreFltCol(DistanceColName, GroupJointTable->FltCols[0]);
2213  ProjectionV.Clr();
2214  ProjectionV.Add(DistanceColName);
2216  // Find the GroupBy columns in the JointTable by matching the Suffix of the Schema
2217  // columns with the original GroupBy columns - Note that Join renames columns.
 // NOTE(review): the match is done with IsStrIn, presumably a substring test, so a
 // schema column whose name merely contains a GroupBy name would also be kept - confirm.
2218  for(TInt i=0; i<GroupBy.Len(); i++){
2219  for(TInt j=0; j<JointTable->Sch.Len(); j++)
2220  {
2221  TStr ColName = JointTable->Sch[j].Val1;
2222  if(ColName.IsStrIn(GroupBy[i]))
2223  {
2224  ProjectionV.Add(ColName);
2225  }
2226  }
2227  }
2229  JointTable->ProjectInPlace(ProjectionV);
2230  JointTable->InitIds();
2231  return JointTable;
2232 }
2234 // Increments the next vector and sets last, NumRows and NumValidRows.
// NOTE(review): the signature line of this function (listing line 2235) and
// listing line 2242 are not present in this listing; the comments below cover
// only the statements that are visible.
2236 {
2237  // Advance the Next vector
 // Account for the one row that the caller has just appended to the columns.
2238  NumRows++;
2239  NumValidRows++;
 // If a previous row exists, overwrite its Next entry (the last element of
 // Next) with NumValidRows-1.
2240  if (!Next.Empty()) {
2241  Next[Next.Len()-1] = NumValidRows-1;
2243  }
 // Append the Last marker as the Next entry of the new row.
2244  Next.Add(Last);
2245 }
// Equi-join of this table (on Col1) with Table (on Col2).
//
// The table with fewer valid rows is hashed on its join column (value -> row
// ids, via the GroupBy*Col helpers), then the other table is scanned and a
// joint row is emitted for every match. Row indices are always passed to the
// joint table in (this, Table) order, whichever side was hashed.
//
// Throws via TExcept::Throw when a column is missing or the column types differ.
//
// NOTE(review): the declaration of the hash `T` is not present in this listing
// for several cases (listing lines 2286, 2330, 2362, 2416 are absent).
2247 // Q: Do we want to have any guarantees in terms of order of the output rows - i.e.
2248 // ordered by "this" table row idx as primary key and "Table" row idx as secondary key
2249  // This means only keeping joint row indices (pairs of original row indices), sorting them
2250  // and adding all rows in the end. Sorting can be expensive, but we would be able to pre-allocate
2251  // memory for the joint table..
2252 PTable TTable::Join(const TStr& Col1, const TTable& Table, const TStr& Col2) {
2253  // double startFn = omp_get_wtime();
 // NOTE(review): in the three checks below the printf follows TExcept::Throw;
 // if Throw does not return, the printf never runs. ThresholdJoinInputCorrectness
 // prints before throwing.
2254  if (!IsColName(Col1)) {
2255  TExcept::Throw("no such column " + Col1);
2256  printf("no such column %s\n", Col1.CStr());
2257  }
2258  if (!Table.IsColName(Col2)) {
2259  TExcept::Throw("no such column " + Col2);
2260  printf("no such column %s\n", Col2.CStr());
2261  }
2262  if (GetColType(Col1) != Table.GetColType(Col2)) {
2263  TExcept::Throw("Trying to Join on columns of different type");
2264  printf("Trying to Join on columns of different type\n");
2265  }
2266  //printf("passed initial checks\n");
2267  // initialize result table
2268  PTable JointTable = InitializeJointTable(Table);
2269  //printf("initialized joint table\n");
2270  // hash smaller table (group by column)
2271  TAttrType ColType = GetColType(Col1);
 // TS/ColS name the smaller table and its join column, TB/ColB the bigger one.
 // On a tie in row count, this table is treated as the smaller one.
2272  TBool ThisIsSmaller = (NumValidRows <= Table.NumValidRows);
2273  const TTable& TS = ThisIsSmaller ? *this : Table;
2274  const TTable& TB = ThisIsSmaller ? Table : *this;
2275  TStr ColS = ThisIsSmaller ? Col1 : Col2;
2276  TStr ColB = ThisIsSmaller ? Col2 : Col1;
 // The index of the big table's join column is looked up on the table that owns it.
2277  TInt ColBId = ThisIsSmaller ? Table.GetColIdx(ColB) : GetColIdx(ColB);
2278  // double endInit = omp_get_wtime();
2279  // printf("Init time = %f\n", endInit-startFn);
2280  // iterate over the rows of the bigger table and check for "collisions"
2281  // with the group keys for the small table.
2282 #ifdef GCC_ATOMIC
 // Parallel path: the big table is cut into partitions, each loop iteration
 // collects (row of this, row of Table) pairs for one partition into its own
 // JointRowIDSet[i], and the pairs are added to the joint table afterwards.
2283  if (GetMP()) {
2284  switch(ColType){
2285  case atInt:{
2287  TS.GroupByIntColMP(ColS, T, true);
2288  // double endGroup = omp_get_wtime();
2289  // printf("Group time = %f\n", endGroup-endInit);
2291  TIntPrV Partitions;
2292  TB.GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
 // NOTE(review): Partitions[0] is read without checking that Partitions is
 // non-empty - presumably GetPartitionRanges always yields at least one range; confirm.
2293  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
2294  TVec<TIntPrV> JointRowIDSet(Partitions.Len());
2295  // double endPart = omp_get_wtime();
2296  // printf("Partition time = %f\n", endPart-endGroup);
2298  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD)
2299  for (int i = 0; i < Partitions.Len(); i++){
2300  //double start = omp_get_wtime();
2301  JointRowIDSet[i].Reserve(PartitionSize);
2302  TRowIterator RowI(Partitions[i].GetVal1(), &TB);
2303  TRowIterator EndI(Partitions[i].GetVal2(), &TB);
2304  while (RowI < EndI) {
2305  TInt K = RowI.GetIntAttr(ColBId);
2306  if(T.IsKey(K)){
2307  TIntV& Group = T.GetDat(K);
2308  for(TInt j = 0; j < Group.Len(); j++){
 // Keep the pair ordered as (row of this, row of Table).
2309  if(ThisIsSmaller){
2310  JointRowIDSet[i].Add(TIntPr(Group[j], RowI.GetRowIdx()));
2311  } else{
2312  JointRowIDSet[i].Add(TIntPr(RowI.GetRowIdx(), Group[j]));
2313  }
2314  }
2315  }
2316  RowI++;
2317  }
2318  //double end = omp_get_wtime();
2319  //printf("END: Thread %d: i = %d, start = %d, end = %d, num = %d, time = %f\n", omp_get_thread_num(), i,
2320  // Partitions[i].GetVal1().Val, Partitions[i].GetVal2().Val, JointRowIDSet[i].Len(), end-start);
2321  }
2322  // double endJoin = omp_get_wtime();
2323  // printf("Iterate time = %f\n", endJoin-endPart);
2324  JointTable->AddNJointRowsMP(*this, Table, JointRowIDSet);
2325  // double endAdd = omp_get_wtime();
2326  // printf("Add time = %f\n", endAdd-endJoin);
2327  break;
2328  }
 // Float join column: same scheme; the small table is hashed with GroupByFltCol
 // and the loop uses schedule(dynamic) without a chunk size.
2329  case atFlt:{
2331  TS.GroupByFltCol(ColS, T, TIntV(), true);
2333  TIntPrV Partitions;
2334  TB.GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
2335  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
2336  TVec<TIntPrV> JointRowIDSet(Partitions.Len());
2338  #pragma omp parallel for schedule(dynamic)
2339  for (int i = 0; i < Partitions.Len(); i++){
2340  JointRowIDSet[i].Reserve(PartitionSize);
2341  TRowIterator RowI(Partitions[i].GetVal1(), &TB);
2342  TRowIterator EndI(Partitions[i].GetVal2(), &TB);
2343  while (RowI < EndI) {
2344  TFlt K = RowI.GetFltAttr(ColBId);
2345  if(T.IsKey(K)){
2346  TIntV& Group = T.GetDat(K);
2347  for(TInt j = 0; j < Group.Len(); j++){
2348  if(ThisIsSmaller){
2349  JointRowIDSet[i].Add(TIntPr(Group[j], RowI.GetRowIdx()));
2350  } else{
2351  JointRowIDSet[i].Add(TIntPr(RowI.GetRowIdx(), Group[j]));
2352  }
2353  }
2354  }
2355  RowI++;
2356  }
2357  }
2358  JointTable->AddNJointRowsMP(*this, Table, JointRowIDSet);
2359  break;
2360  }
 // String join column: the key compared is the integer returned by GetStrMapById.
2361  case atStr:{
2363  TS.GroupByStrCol(ColS, T, TIntV(), true);
2365  TIntPrV Partitions;
2366  TB.GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
2367  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
2368  TVec<TIntPrV> JointRowIDSet(Partitions.Len());
2370  #pragma omp parallel for schedule(dynamic)
2371  for (int i = 0; i < Partitions.Len(); i++){
2372  JointRowIDSet[i].Reserve(PartitionSize);
2373  TRowIterator RowI(Partitions[i].GetVal1(), &TB);
2374  TRowIterator EndI(Partitions[i].GetVal2(), &TB);
2375  while (RowI < EndI) {
2376  TInt K = RowI.GetStrMapById(ColBId);
2377  if(T.IsKey(K)){
2378  TIntV& Group = T.GetDat(K);
2379  for(TInt j = 0; j < Group.Len(); j++){
2380  if(ThisIsSmaller){
2381  JointRowIDSet[i].Add(TIntPr(Group[j], RowI.GetRowIdx()));
2382  } else{
2383  JointRowIDSet[i].Add(TIntPr(RowI.GetRowIdx(), Group[j]));
2384  }
2385  }
2386  }
2387  RowI++;
2388  }
2389  }
2390  JointTable->AddNJointRowsMP(*this, Table, JointRowIDSet);
2391  }
2392  break;
2393  }
2394  } else {
2395 #endif // GCC_ATOMIC
 // Sequential path: scan the big table once and add joint rows directly.
2396  switch (ColType) {
2397  case atInt:{
2398  TIntIntVH T;
2399  TS.GroupByIntCol(ColS, T, TIntV(), true);
2400  for (TRowIterator RowI = TB.BegRI(); RowI < TB.EndRI(); RowI++) {
2401  TInt K = RowI.GetIntAttr(ColBId);
2402  if (T.IsKey(K)) {
2403  TIntV& Group = T.GetDat(K);
2404  for (TInt i = 0; i < Group.Len(); i++) {
 // AddJointRow always receives (row of this, row of Table).
2405  if (ThisIsSmaller) {
2406  JointTable->AddJointRow(*this, Table, Group[i], RowI.GetRowIdx());
2407  } else {
2408  JointTable->AddJointRow(*this, Table, RowI.GetRowIdx(), Group[i]);
2409  }
2410  }
2411  }
2412  }
2413  break;
2414  }
2415  case atFlt:{
2417  TS.GroupByFltCol(ColS, T, TIntV(), true);
2418  for (TRowIterator RowI = TB.BegRI(); RowI < TB.EndRI(); RowI++) {
2419  TFlt K = RowI.GetFltAttr(ColBId);
2420  if (T.IsKey(K)) {
2421  TIntV& Group = T.GetDat(K);
2422  for (TInt i = 0; i < Group.Len(); i++) {
2423  if (ThisIsSmaller) {
2424  JointTable->AddJointRow(*this, Table, Group[i], RowI.GetRowIdx());
2425  } else {
2426  JointTable->AddJointRow(*this, Table, RowI.GetRowIdx(), Group[i]);
2427  }
2428  }
2429  }
2430  }
2431  break;
2432  }
2433  case atStr:{
2434  TIntIntVH T;
2435  TS.GroupByStrCol(ColS, T, TIntV(), true);
2436  for (TRowIterator RowI = TB.BegRI(); RowI < TB.EndRI(); RowI++) {
2437  TInt K = RowI.GetStrMapById(ColBId);
2438  if (T.IsKey(K)) {
2439  TIntV& Group = T.GetDat(K);
2440  for (TInt i = 0; i < Group.Len(); i++) {
2441  if (ThisIsSmaller) {
2442  JointTable->AddJointRow(*this, Table, Group[i], RowI.GetRowIdx());
2443  } else {
2444  JointTable->AddJointRow(*this, Table, RowI.GetRowIdx(), Group[i]);
2445  }
2446  }
2447  }
2448  }
2449  }
2450  break;
2451  }
2452 #ifdef GCC_ATOMIC
2453  }
2454 #endif
2455  return JointTable;
2456 }
2458 void TTable::ThresholdJoinInputCorrectness(const TStr& KeyCol1, const TStr& JoinCol1, const TTable& Table,
2459  const TStr& KeyCol2, const TStr& JoinCol2){
2460  if (!IsColName(KeyCol1)) {
2461  printf("no such column %s\n", KeyCol1.CStr());
2462  TExcept::Throw("no such column " + KeyCol1);
2463  }
2464  if (!Table.IsColName(KeyCol2)) {
2465  printf("no such column %s\n", KeyCol2.CStr());
2466  TExcept::Throw("no such column " + KeyCol2);
2467  }
2468  if (!IsColName(JoinCol1)) {
2469  printf("no such column %s\n", JoinCol1.CStr());
2470  TExcept::Throw("no such column " + JoinCol1);
2471  }
2472  if (!Table.IsColName(JoinCol2)) {
2473  printf("no such column %s\n", JoinCol2.CStr());
2474  TExcept::Throw("no such column " + JoinCol2);
2475  }
2476  if (GetColType(JoinCol1) != Table.GetColType(JoinCol2)) {
2477  printf("Trying to Join on columns of different type\n");
2478  TExcept::Throw("Trying to Join on columns of different type");
2479  }
2480  if (GetColType(KeyCol1) != Table.GetColType(KeyCol2)) {
2481  printf("Key type mismatch\n");
2482  TExcept::Throw("Key type mismatch");
2483  }
2484 }
// Counts, for every pair of key values, how many joint tuples share that pair.
//
// NOTE(review): the first line of the signature (listing line 2486) is not
// present in this listing; TB and TS are used below as the bigger and smaller
// table respectively.
//
// T           - join value -> row ids of the small table with that value.
// JoinColIdxB - index of the join column in the big table.
// KeyColIdxB  - index of the key column in the big table.
// KeyColIdxS  - index of the key column in the small table.
// Counters    - output: (key1, key2) -> (row idx 1, row idx 2, count). The row
//               indices are those of the first joint tuple seen for the pair.
// ThisIsSmaller decides the orientation of the key pair and of the stored row pair.
2487  const TIntIntVH& T, TInt JoinColIdxB, TInt KeyColIdxB, TInt KeyColIdxS,
2488  THash<TIntPr,TIntTr>& Counters, TBool ThisIsSmaller, TAttrType JoinColType, TAttrType KeyType){
2489  // iterate over big table and count / record joint tuples
2490  for (TRowIterator RowI = TB.BegRI(); RowI < TB.EndRI(); RowI++) {
2491  // value to join on from big table
 // String columns are compared through the integer returned by GetStrMapById.
2492  TInt JVal = 0;
2493  if(JoinColType == atStr){
2494  JVal = RowI.GetStrMapById(JoinColIdxB);
2495  } else{
2496  JVal = RowI.GetIntAttr(JoinColIdxB);
2497  }
2498  //printf("JVal: %d\n", JVal.Val);
2499  if(T.IsKey(JVal)){
2500  // read key attribute of big table row
2501  TInt KeyB = 0;
2502  if(KeyType == atStr){
2503  KeyB = RowI.GetStrMapById(KeyColIdxB);
2504  } else{
2505  KeyB = RowI.GetIntAttr(KeyColIdxB);
2506  }
2507  // read row ids from small table with join attribute value of JVal
2508  const TIntV& RelevantRows = T.GetDat(JVal);
2509  for(int i = 0; i < RelevantRows.Len(); i++){
2510  // read key attribute of relevant row from small table
 // The small table's columns are indexed directly by row id.
2511  TInt KeyS = 0;
2512  if(KeyType == atStr){
2513  KeyS = TS.StrColMaps[KeyColIdxS][RelevantRows[i]];
2514  } else{
2515  KeyS = TS.IntCols[KeyColIdxS][RelevantRows[i]];
2516  }
2517  // create a pair of keys - serves as a key in Counters
 // The pair is ordered (key of this table, key of Table).
2518  TIntPr Keys = ThisIsSmaller ? TIntPr(KeyS, KeyB) : TIntPr(KeyB, KeyS);
2519  if(Counters.IsKey(Keys)){
2520  // if the key pair has been seen before - increment its counter by 1
2521  TIntTr& V = Counters.GetDat(Keys);
2522  V.Val3 = V.Val3 + 1;
2523  } else{
2524  // if the key pair hasn't been seen before - add it with value of
2525  // row indices that create a joint record with this key pair
2526  if(ThisIsSmaller){
2527  Counters.AddDat(Keys, TIntTr(RelevantRows[i], RowI.GetRowIdx(),1));
2528  } else{
2529  Counters.AddDat(Keys, TIntTr(RowI.GetRowIdx(), RelevantRows[i],1));
2530  }
2531  }
2532  } // end of for loop
2533  } // end of if statement
2534  } // end of for loop
2535 }
// Same counting scheme as the key-pair counter, but the hash key also carries
// the join value, so counts are kept per (key1, key2, join value) triple.
//
// NOTE(review): the first line of the signature (listing line 2537) is not
// present in this listing; TB and TS are used below as the bigger and smaller
// table respectively.
//
// T        - join value -> row ids of the small table with that value.
// Counters - output: (key1, key2, join value) -> (row idx 1, row idx 2, count).
2538  const TIntIntVH& T, TInt JoinColIdxB, TInt KeyColIdxB, TInt KeyColIdxS,
2539  THash<TIntTr,TIntTr>& Counters, TBool ThisIsSmaller, TAttrType JoinColType, TAttrType KeyType){
2540  for (TRowIterator RowI = TB.BegRI(); RowI < TB.EndRI(); RowI++) {
2541  // value to join on from big table
 // String columns are compared through the integer returned by GetStrMapById.
2542  TInt JVal = 0;
2543  if(JoinColType == atStr){
2544  JVal = RowI.GetStrMapById(JoinColIdxB);
2545  } else{
2546  JVal = RowI.GetIntAttr(JoinColIdxB);
2547  }
2548  //printf("JVal: %d\n", JVal.Val);
2549  if(T.IsKey(JVal)){
2550  // read key attribute of big table row
2551  TInt KeyB = 0;
2552  if(KeyType == atStr){
2553  KeyB = RowI.GetStrMapById(KeyColIdxB);
2554  } else{
2555  KeyB = RowI.GetIntAttr(KeyColIdxB);
2556  }
2557  // read row ids from small table with join attribute value of JVal
2558  const TIntV& RelevantRows = T.GetDat(JVal);
2559  for(int i = 0; i < RelevantRows.Len(); i++){
2560  // read key attribute of relevant row from small table
2561  TInt KeyS = 0;
2562  if(KeyType == atStr){
2563  KeyS = TS.StrColMaps[KeyColIdxS][RelevantRows[i]];
2564  } else{
2565  KeyS = TS.IntCols[KeyColIdxS][RelevantRows[i]];
2566  }
2567  // create a pair of keys - serves as a key in Counters
2568  TIntPr Keys = ThisIsSmaller ? TIntPr(KeyS, KeyB) : TIntPr(KeyB, KeyS);
 // Extend the key pair with the join value to form the actual hash key.
2569  TIntTr K(Keys.Val1,Keys.Val2,JVal);
2570  if(Counters.IsKey(K)){
2571  // if the key triple has been seen before - increment its counter by 1
2572  TIntTr& V = Counters.GetDat(K);
2573  V.Val3 = V.Val3 + 1;
2574  } else{
2575  // if the key triple hasn't been seen before - add it with value of
2576  // row indices that create a joint record with this key triple
2577  if(ThisIsSmaller){
2578  Counters.AddDat(K, TIntTr(RelevantRows[i], RowI.GetRowIdx(),1));
2579  } else{
2580  Counters.AddDat(K, TIntTr(RowI.GetRowIdx(), RelevantRows[i],1));
2581  }
2582  }
2583  } // end of for loop
2584  } // end of if statement
2585  } // end of for loop
2586  }
2588 PTable TTable::ThresholdJoinOutputTable(const THash<TIntPr,TIntTr>& Counters, TInt Threshold, const TTable& Table){
2589  // initialize result table
2590  PTable JointTable = InitializeJointTable(Table);
2591  for(THash<TIntPr,TIntTr>::TIter iter = Counters.BegI(); iter < Counters.EndI(); iter++){
2592  TIntTr& Counter = iter.GetDat();
2593  //printf("keys: %d, %d\n", iter.GetKey().Val1.Val, iter.GetKey().Val2.Val);
2594  //printf("selected rows: %d,%d, counter: %d\n", Counter.Val1.Val, Counter.Val2.Val, Counter.Val3.Val);
2595  if(Counter.Val3 >= Threshold){
2596  JointTable->AddJointRow(*this, Table, Counter.Val1, Counter.Val2);
2597  }
2598  }
2599  return JointTable;
2600 }
2603  PTable JointTable = InitializeJointTable(Table);
2604  for(THash<TIntTr,TIntTr>::TIter iter = Counters.BegI(); iter < Counters.EndI(); iter++){
2605  const TIntTr& Counter = iter.GetDat();
2606  const TIntTr& Keys = iter.GetKey();
2607  THashSet<TIntPr> Pairs;
2608  if(Counter.Val3 >= Threshold){
2609  TIntPr K(Keys.Val1,Keys.Val2);
2610  if(!Pairs.IsKey(K)){
2611  Pairs.AddKey(K);
2612  JointTable->AddJointRow(*this, Table, Counter.Val1, Counter.Val2);
2613  }
2614  }
2615  }
2616  return JointTable;
2617 }
2620 // expected output: one joint tuple (R1,R2) with:
2621 // (1) R1[KeyCol1] = K1 and R2[KeyCol2] = K2
2622 // for every pair of keys (K1,K2) such that the number of joint tuples
2623 // (joint on R1[JoinCol1] = R2[JointCol2]) that hold property (1) is at least Threshold
2624 PTable TTable::ThresholdJoin(const TStr& KeyCol1, const TStr& JoinCol1, const TTable& Table,
2625  const TStr& KeyCol2, const TStr& JoinCol2, TInt Threshold, TBool PerJoinKey){
2626  // test input correctness
2627  ThresholdJoinInputCorrectness(KeyCol1, JoinCol1, Table, KeyCol2, JoinCol2);
2628  //printf("verified input correctness\n");
2629  // type of column on which we join (currently support only int)
2630  TAttrType JoinColType = GetColType(JoinCol1);
2631  // type of key column (currently support only int)
2632  TAttrType KeyType = GetColType(KeyCol1);
2633  // Determine which table is smaller
2634  TBool ThisIsSmaller = (NumValidRows <= Table.NumValidRows);
2635  const TTable& TS = ThisIsSmaller ? *this : Table;
2636  const TTable& TB = ThisIsSmaller ? Table : *this;
2637  TStr JoinColS = JoinCol1;
2638  TInt JoinColIdxB = GetColIdx(JoinCol2);
2639  TInt KeyColIdxS = GetColIdx(KeyCol1);
2640  TInt KeyColIdxB = GetColIdx(KeyCol2);
2641  if(!ThisIsSmaller){
2642  JoinColS = JoinCol2;
2643  JoinColIdxB = GetColIdx(JoinCol1);
2644  KeyColIdxS = GetColIdx(KeyCol2);
2645  KeyColIdxB = GetColIdx(KeyCol1);
2646  }
2648  // debug print
2649  //printf("JoinColS = %d, JoinColIdxB = %d, KeyColIdxS = %d, KeyColIdxB = %d\n",
2650  //GetColIdx(JoinColS).Val, JoinColIdxB.Val, KeyColIdxS.Val, KeyColIdxB.Val);
2651  //printf("starting switch-case\n");
2653  if(KeyType != atInt && KeyType != atStr){
2654  printf("ThresholdJoin only supports integer or string key attributes\n");
2655  TExcept::Throw("ThresholdJoin only supports integer or string key attributes");
2656  }
2657  if(JoinColType != atInt && JoinColType != atStr){
2658  printf("ThresholdJoin only supports integer or string join attributes\n");
2659  TExcept::Throw("ThresholdJoin only supports integer or string join attributes");
2660  }
2661  //printf("starting the real stuff!\n");
2662  // hash the smaller table T: join col value --> physical row ids of rows with that value
2663  TIntIntVH T;
2664  if(JoinColType == atInt){
2665  TS.GroupByIntCol(JoinColS, T, TIntV(), true);
2666  } else if(JoinColType == atStr){
2667  TS.GroupByStrCol(JoinColS, T, TIntV(), true);
2668  } else{
2669  TExcept::Throw("ThresholdJoin only supports integer or string join attributes");
2670  }
2672  /*
2673  for(THash<TInt,TIntV>::TIter it = T.BegI(); it < T.EndI(); it++){
2674  if(JoinColType == atStr){
2675  printf("%s -->", Context.StringVals.GetKey(it.GetKey().Val));
2676  } else{
2677  printf("%d -->", it.GetKey().Val);
2678  }
2679  const TIntV& V = it.GetDat();
2680  for(int sr = 0; sr < V.Len(); sr++){
2681  printf(" %d", V[sr].Val);
2682  }
2683  printf("\n");
2684  }
2685  */
2687  // Counters: (K1,K2) --> (RowIdx1,RowIdx2, count) where K1 is a key from KeyCol1,
2688  // K2 is a key from Table's KeyCol2; RowIdx1 and RowIdx2 are physical row ids
2689  // that participates in a joint tuple that satisfies (1).
2690  // count is the count of joint records that satisfy (1).
2691  // In case of string attributes - the integer mappings of the key attribute values are used.
2692  if(PerJoinKey){
2693  //printf("PerJoinKey\n");
2694  THash<TIntTr,TIntTr> Counters;
2695  ThresholdJoinCountPerJoinKeyCollisions(TB, TS, T, JoinColIdxB, KeyColIdxB, KeyColIdxS, Counters, ThisIsSmaller, JoinColType, KeyType);
2696  /*
2697  for(THash<TIntTr,TIntTr>::TIter it = Counters.BegI(); it < Counters.EndI(); it++){
2698  const TIntTr& K = it.GetKey();
2699  const TIntTr& V = it.GetDat();
2700  if(KeyType == atStr){
2701  printf("%s %s --> %d %d %d\n", Context->StringVals.GetKey(K.Val1), Context->StringVals.GetKey(K.Val2), V.Val1.Val, V.Val2.Val, V.Val3.Val);
2702  } else{
2703  printf("%d %d --> %d %d %d\n", K.Val1.Val, K.Val2.Val, V.Val1.Val, V.Val2.Val, V.Val3.Val);
2704  }
2705  }
2706  */
2707  //printf("found collisions\n");
2708  return ThresholdJoinPerJoinKeyOutputTable(Counters, Threshold, Table);
2709  } else{
2710  //printf("not PerJoinKey\n");
2711  THash<TIntPr,TIntTr> Counters;
2712  ThresholdJoinCountCollisions(TB, TS, T, JoinColIdxB, KeyColIdxB, KeyColIdxS, Counters, ThisIsSmaller, JoinColType, KeyType);
2713  /*
2714  for(THash<TIntPr,TIntTr>::TIter it = Counters.BegI(); it < Counters.EndI(); it++){
2715  const TIntPr& K = it.GetKey();
2716  const TIntTr& V = it.GetDat();
2717  if(KeyType == atStr){
2718  printf("%s %s --> %d %d %d\n", Context->StringVals.GetKey(K.Val1), Context->StringVals.GetKey(K.Val2), V.Val1.Val, V.Val2.Val, V.Val3.Val);
2719  } else{
2720  printf("%d %d --> %d %d %d\n", K.Val1.Val, K.Val2.Val, V.Val1.Val, V.Val2.Val, V.Val3.Val);
2721  }
2722  }
2723  */
2724  //printf("found collisions\n");
2725  return ThresholdJoinOutputTable(Counters, Threshold, Table);
2726  }
2727 }
// Evaluates Predicate on every row of this table.
//
// Predicate    - predicate to evaluate; its variables are taken to be column
//                names of this table and are bound to the row's values before
//                each evaluation.
// SelectedRows - when Remove is false, receives the row index of every row for
//                which the predicate holds. Not touched in the Remove branch.
// Remove       - when true, rows for which the predicate does NOT hold are
//                removed from this table instead.
//
// NOTE(review): the declaration of `RowI` used in the Remove branch (listing
// line 2743) is not present in this listing.
2730 void TTable::Select(TPredicate& Predicate, TIntV& SelectedRows, TBool Remove) {
 // NOTE(review): `Selected` is not used anywhere in the visible body.
2731  TIntV Selected;
2732  TStrV RelevantCols;
2733  Predicate.GetVariables(RelevantCols);
 // Resolve type and index of every referenced column once, outside the row loop.
2734  TInt NumRelevantCols = RelevantCols.Len();
2735  TVec<TAttrType> ColTypes = TVec<TAttrType>(NumRelevantCols);
2736  TIntV ColIndices = TIntV(NumRelevantCols);
2737  for (TInt i = 0; i < NumRelevantCols; i++) {
2738  ColTypes[i] = GetColType(RelevantCols[i]);
2739  ColIndices[i] = GetColIdx(RelevantCols[i]);
2740  }
2742  if (Remove) {
 // The iterator is queried through its GetNext* accessors: the row that would
 // be visited next is tested, and is either removed or stepped onto.
2744  while (RowI.GetNextRowIdx() != Last) {
2745  // prepare arguments for predicate evaluation
2746  for (TInt i = 0; i < NumRelevantCols; i++) {
2747  switch (ColTypes[i]) {
2748  case atInt:
2749  Predicate.SetIntVal(RelevantCols[i], RowI.GetNextIntAttr(ColIndices[i]));
2750  break;
2751  case atFlt:
2752  Predicate.SetFltVal(RelevantCols[i], RowI.GetNextFltAttr(ColIndices[i]));
2753  break;
2754  case atStr:
2755  Predicate.SetStrVal(RelevantCols[i], RowI.GetNextStrAttr(ColIndices[i]));
2756  break;
2757  }
2758  }
2759  if (!Predicate.Eval()) {
2760  RowI.RemoveNext();
2761  } else {
2762  RowI++;
2763  }
2764  }
2765  } else {
 // Non-destructive branch. Here the attributes are read by column name rather
 // than through the ColIndices computed above.
2766  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
2767  for (TInt i = 0; i < NumRelevantCols; i++) {
2768  switch (ColTypes[i]) {
2769  case atInt:
2770  Predicate.SetIntVal(RelevantCols[i], RowI.GetIntAttr(RelevantCols[i]));
2771  break;
2772  case atFlt:
2773  Predicate.SetFltVal(RelevantCols[i], RowI.GetFltAttr(RelevantCols[i]));
2774  break;
2775  case atStr:
2776  Predicate.SetStrVal(RelevantCols[i], RowI.GetStrAttr(RelevantCols[i]));
2777  break;
2778  }
2779  }
2780  if (Predicate.Eval()) { SelectedRows.Add(RowI.GetRowIdx()); }
2781  }
2782  }
2783 }
2785 void TTable::Classify(TPredicate& Predicate, const TStr& LabelName, const TInt& PositiveLabel, const TInt& NegativeLabel) {
2786  TIntV SelectedRows;
2787  Select(Predicate, SelectedRows, false);
2788  ClassifyAux(SelectedRows, LabelName, PositiveLabel, NegativeLabel);
2789 }
// Selects rows by comparing two columns of the same row with Cmp.
//
// Col1, Col2   - columns to compare; they must have the same type, otherwise
//                "SelectAtomic: diff types" is thrown.
// Cmp          - comparison operator; SUBSTR and SUPERSTR are asserted to be
//                used with string columns only.
// SelectedRows - when Remove is false, receives the index of every row for
//                which the comparison holds. Not touched in the Remove branch.
// Remove       - when true, rows for which the comparison does NOT hold are
//                removed from this table instead.
//
// NOTE(review): the declaration of `RowI` used in the Remove branch (listing
// line 2804) is not present in this listing.
2792 // Further optimization: both comparison operation and type of columns don't change between rows..
2793 void TTable::SelectAtomic(const TStr& Col1, const TStr& Col2, TPredComp Cmp, TIntV& SelectedRows, TBool Remove) {
2794  const TAttrType Ty1 = GetColType(Col1);
2795  const TAttrType Ty2 = GetColType(Col2);
2796  const TInt ColIdx1 = GetColIdx(Col1);
2797  const TInt ColIdx2 = GetColIdx(Col2);
2798  if (Ty1 != Ty2) {
2799  TExcept::Throw("SelectAtomic: diff types");
2800  }
2801  if (Cmp == SUBSTR || Cmp == SUPERSTR) { Assert(Ty1 == atStr); }
2803  if (Remove) {
 // The row that would be visited next is tested through the GetNext*
 // accessors, then either removed or stepped onto.
2805  while (RowI.GetNextRowIdx() != Last) {
 // NOTE(review): Result is only assigned inside the switch; presumably Ty1 is
 // always one of the three cases - confirm.
2807  TBool Result;
2808  switch (Ty1) {
2809  case atInt:
2810  Result = TPredicate::EvalAtom(RowI.GetNextIntAttr(ColIdx1), RowI.GetNextIntAttr(ColIdx2), Cmp);
2811  break;
2812  case atFlt:
2813  Result = TPredicate::EvalAtom(RowI.GetNextFltAttr(ColIdx1), RowI.GetNextFltAttr(ColIdx2), Cmp);
2814  break;
2815  case atStr:
2816  Result = TPredicate::EvalStrAtom(RowI.GetNextStrAttr(ColIdx1), RowI.GetNextStrAttr(ColIdx2), Cmp);
2817  break;
2818  }
2820  if (!Result) {
2821  RowI.RemoveNext();
2822  } else {
2823  RowI++;
2824  }
2826  }
2827  } else {
 // Non-destructive branch. Attributes are read by column name here, not
 // through ColIdx1/ColIdx2.
2828  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
2829  TBool Result;
2830  switch (Ty1) {
2831  case atInt:
2832  Result = TPredicate::EvalAtom(RowI.GetIntAttr(Col1), RowI.GetIntAttr(Col2), Cmp);
2833  break;
2834  case atFlt:
2835  Result = TPredicate::EvalAtom(RowI.GetFltAttr(Col1), RowI.GetFltAttr(Col2), Cmp);
2836  break;
2837  case atStr:
2838  Result = TPredicate::EvalStrAtom(RowI.GetStrAttr(Col1), RowI.GetStrAttr(Col2), Cmp);
2839  break;
2840  }
2841  if (Result) { SelectedRows.Add(RowI.GetRowIdx()); }
2842  }
2843  }
2844 }
2846 void TTable::ClassifyAtomic(const TStr& Col1, const TStr& Col2, TPredComp Cmp,
2847  const TStr& LabelName, const TInt& PositiveLabel, const TInt& NegativeLabel) {
2848  TIntV SelectedRows;
2849  SelectAtomic(Col1, Col2, Cmp, SelectedRows, false);
2850  ClassifyAux(SelectedRows, LabelName, PositiveLabel, NegativeLabel);
2851 }
// Selects rows by comparing column Col with the constant Val using Cmp.
//
// NOTE(review): the first line of the signature (listing line 2853) and listing
// lines 2921-2922 and 2943 are not present in this listing. Col, Val and Cmp
// are used below as the column name, the constant and the comparison operator.
//
// Three modes, chosen by the flags:
//   Remove          - rows that do NOT match are unlinked from this table.
//   !Remove && Table - matching rows are added to SelectedTable.
//   otherwise        - indices of matching rows are appended to SelectedRows.
//
// Throws "SelectAtomicConst: coltype does not match const type" when the column
// type differs from Val.GetType().
//
// NOTE(review): the OpenMP paths use CompareAtomicConstTStr for string columns,
// while every sequential path calls CompareAtomicConst regardless of the column
// type - confirm that CompareAtomicConst handles atStr.
2854  TIntV& SelectedRows, PTable& SelectedTable, TBool Remove, TBool Table) {
2855  //double startFn = omp_get_wtime();
 // String form of the constant, built once for the string comparisons below.
2856  TStr ValTStr(Val.GetStr());
2857  TAttrType Type = GetColType(Col);
2858  TInt ColIdx = GetColIdx(Col);
2860  if (Type != Val.GetType()) {
2861  TExcept::Throw("SelectAtomicConst: coltype does not match const type");
2862  }
2864  if(Remove){
2865 #ifdef USE_OPENMP
2866  if (GetMP()) {
2867  //double endInit = omp_get_wtime();
2868  //printf("Init time = %f\n", endInit-startFn);
2869  TIntPrV Partitions;
2870  GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
 // NOTE(review): PartitionSize is not used in this branch.
2871  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
2872  int RemoveCount = 0;
2873  //double endPart = omp_get_wtime();
2874  //printf("Partition time = %f\n", endPart-endInit);
 // Bounds[i] = (first, last) surviving row index of partition i, or
 // (Invalid, Invalid) when no row of the partition survives.
2876  TIntPrV Bounds(Partitions.Len());
2878  // #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD) reduction(+:RemoveCount) shared(Val)
2879  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD) reduction(+:RemoveCount)
2880  for (int i = 0; i < Partitions.Len(); i++){
2881  //TPrimitive ThreadLocalVal(Val);
2882  TRowIterator RowI(Partitions[i].GetVal1(), this);
2883  TRowIterator EndI(Partitions[i].GetVal2(), this);
2884  TInt FirstRowIdx = TTable::Invalid;
2885  TInt LastRowIdx = TTable::Invalid;
2886  TBool First = true;
2887  while (RowI < EndI) {
2888  TInt CurrRowIdx = RowI.GetRowIdx();
2889  TBool Result;
2890  if (Type != atStr) {
2891  Result = RowI.CompareAtomicConst(ColIdx, Val, Cmp);
2892  } else {
2893  Result = RowI.CompareAtomicConstTStr(ColIdx, ValTStr, Cmp);
2894  }
 // Advance before Next[CurrRowIdx] is overwritten below.
2895  RowI++;
2896  if(!Result) {
2897  Next[CurrRowIdx] = TTable::Invalid;
2898  RemoveCount++;
2899  } else {
 // Chain the survivors of this partition together.
2900  if (First) { FirstRowIdx = CurrRowIdx; First = false; }
2901  else { Next[LastRowIdx] = CurrRowIdx; }
2902  LastRowIdx = CurrRowIdx;
2903  }
2904  }
2905  Bounds[i] = TIntPr(FirstRowIdx, LastRowIdx);
2906  //printf("Thread %d: i = %d, start = %d, end = %d\n", omp_get_thread_num(), i,
2907  // Partitions[i].GetVal1().Val, Partitions[i].GetVal2().Val);
2908  }
2909  //double endIter = omp_get_wtime();
2910  //printf("Iter time = %f\n", endIter-endPart);
2912  // repair the next vector
 // Skip leading partitions without survivors.
2913  TInt CurrBound = 0;
2914  while (CurrBound < Bounds.Len() && Bounds[CurrBound].Val1 == TTable::Invalid) {
2915  CurrBound++;
2916  }
2917  if (CurrBound == Bounds.Len()) {
2918  // selected table is empty
2919  Assert(NumValidRows == RemoveCount);
2920  NumValidRows = 0;
2923  } else {
2924  NumValidRows -= RemoveCount;
2925  FirstValidRow = Bounds[CurrBound].Val1;
2926  LastValidRow = Bounds[CurrBound].Val2;
 // Link the last survivor of each non-empty partition to the first survivor
 // of the next non-empty one.
2927  TInt PrevBound = CurrBound;
2928  CurrBound++;
2929  while (CurrBound < Bounds.Len()) {
2930  if (Bounds[CurrBound].Val1 == TTable::Invalid) { CurrBound++; continue; }
2931  Next[Bounds[PrevBound].Val2] = Bounds[CurrBound].Val1;
2932  LastValidRow = Bounds[CurrBound].Val2;
2933  PrevBound = CurrBound;
2934  CurrBound++;
2935  }
 // Terminate the chain after the last surviving row.
2936  Next[Bounds[PrevBound].Val2] = TTable::Last;
2937  }
2938  IsNextDirty = 1;
2939  //double endRepair = omp_get_wtime();
2940  //printf("Repair time = %f\n", endRepair-endIter);
2941  } else {
2942 #endif
 // Sequential removal: test the row that would be visited next, then either
 // remove it or step onto it.
2944  while(RowI.GetNextRowIdx() != Last){
2945  if (!RowI.CompareAtomicConst(ColIdx, Val, Cmp)) {
2946  RowI.RemoveNext();
2947  } else {
2948  RowI++;
2949  }
2950  }
2951  IsNextDirty = 1;
2952 #ifdef USE_OPENMP
2953  }
2954 #endif
2955  } else if (Table) {
2956 #ifdef USE_OPENMP
2957  if (GetMP()) {
2958  //double endInit = omp_get_wtime();
2959  //printf("Init time = %f\n", endInit-startFn);
2960  TIntPrV Partitions;
2961  GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
2962  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
2963  //double endPart = omp_get_wtime();
2964  //printf("Partition time = %f\n", endPart-endInit);
 // Pass 1: count the matching rows so that SelectedTable can be resized once.
2966  int TotalSelectedRows = 0;
2967  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD) reduction(+:TotalSelectedRows)
2968  for (int i = 0; i < Partitions.Len(); i++){
2969  TRowIterator RowI(Partitions[i].GetVal1(), this);
2970  TRowIterator EndI(Partitions[i].GetVal2(), this);
2971  while (RowI < EndI) {
2972  if (Type != atStr) {
2973  if (RowI.CompareAtomicConst(ColIdx, Val, Cmp)) {
2974  TotalSelectedRows++;
2975  }
2976  } else {
2977  if (RowI.CompareAtomicConstTStr(ColIdx, ValTStr, Cmp)) {
2978  TotalSelectedRows++;
2979  }
2980  }
2981  RowI++;
2982  }
2983  }
2984  //double endCount = omp_get_wtime();
2985  //printf("Count time = %f\n", endCount-endPart);
2987  SelectedTable->ResizeTable(TotalSelectedRows);
2988  //double endResize = omp_get_wtime();
2989  //printf("Resize time = %f\n", endResize-endCount);
2991  if (TotalSelectedRows == 0) {
2992  // printf("Select: Empty output!\n");
2993  return;
2994  }
 // Pass 2: evaluate the comparison again, collect the matching row indices of
 // each partition locally and hand them to SelectedTable.
2996  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD)
2997  for (int i = 0; i < Partitions.Len(); i++){
2998  TIntV LocalSelectedRows;
2999  LocalSelectedRows.Reserve(PartitionSize);
3000  TRowIterator RowI(Partitions[i].GetVal1(), this);
3001  TRowIterator EndI(Partitions[i].GetVal2(), this);
3002  while (RowI < EndI) {
3003  if (Type != atStr) {
3004  if (RowI.CompareAtomicConst(ColIdx, Val, Cmp)) {
3005  LocalSelectedRows.Add(RowI.GetRowIdx());
3006  }
3007  } else {
3008  if (RowI.CompareAtomicConstTStr(ColIdx, ValTStr, Cmp)) {
3009  LocalSelectedRows.Add(RowI.GetRowIdx());
3010  }
3011  }
3012  RowI++;
3013  }
3014  SelectedTable->AddSelectedRows(*this, LocalSelectedRows);
3015  //printf("Thread %d: i = %d, start = %d, end = %d\n", omp_get_thread_num(), i,
3016  // Partitions[i].GetVal1().Val, Partitions[i].GetVal2().Val);
3017  }
3018  //double endIter = omp_get_wtime();
3019  //printf("Iter time = %f\n", endIter-endResize);
3021  //SelectedTable->ResizeTable(SelectedTable->GetNumValidRows());
3022  //double endResize2 = omp_get_wtime();
3023  //printf("Resize2 time = %f\n", endResize2-endIter);
3024  SelectedTable->SetFirstValidRow();
3025  } else {
3026 #endif
 // Sequential copy of the matching rows into SelectedTable.
3027  for(TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++){
3028  if (RowI.CompareAtomicConst(ColIdx, Val, Cmp)) {
3029  SelectedTable->AddRow(RowI);
3030  }
3031  }
3032 #ifdef USE_OPENMP
3033  }
3034 #endif
3035  } else {
 // Neither Remove nor Table: only report the indices of the matching rows.
3036  for(TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++){
3037  if (RowI.CompareAtomicConst(ColIdx, Val, Cmp)) {
3038  SelectedRows.Add(RowI.GetRowIdx());
3039  }
3040  }
3041  }
3042 }
3044 inline TInt TTable::CompareRows(TInt R1, TInt R2, const TAttrType& CompareByType, const TInt& CompareByIndex, TBool Asc) {
3045  //printf("comparing rows %d %d by %s\n", R1.Val, R2.Val, CompareBy.CStr());
3046  switch (CompareByType) {
3047  case atInt:{
3048  if (IntCols[CompareByIndex][R1] > IntCols[CompareByIndex][R2]) { return (Asc ? 1 : -1); }
3049  if (IntCols[CompareByIndex][R1] < IntCols[CompareByIndex][R2]) { return (Asc ? -1 : 1); }
3050  return 0;
3051  }
3052  case atFlt:{
3053  if (FltCols[CompareByIndex][R1] > FltCols[CompareByIndex][R2]) { return (Asc ? 1 : -1); }
3054  if (FltCols[CompareByIndex][R1] < FltCols[CompareByIndex][R2]) { return (Asc ? -1 : 1); }
3055  return 0;
3056  }
3057  case atStr:{
3058  TStr S1 = GetStrVal(CompareByIndex, R1);
3059  TStr S2 = GetStrVal(CompareByIndex, R2);
3060  int CmpRes = strcmp(S1.CStr(), S2.CStr());
3061  return (Asc ? CmpRes : -CmpRes);
3062  }
3063  }
3064  // code should not come here, added to remove a compiler warning
3065  return 0;
3066 }
3068 inline TInt TTable::CompareRows(TInt R1, TInt R2, const TVec<TAttrType>& CompareByTypes, const TIntV& CompareByIndices, TBool Asc) {
3069  for (TInt i = 0; i < CompareByTypes.Len(); i++) {
3070  TInt res = CompareRows(R1, R2, CompareByTypes[i], CompareByIndices[i], Asc);
3071  if (res != 0) { return res; }
3072  }
3073  return 0;
3074 }
3076 void TTable::ISort(TIntV& V, TInt StartIdx, TInt EndIdx, const TVec<TAttrType>& SortByTypes, const TIntV& SortByIndices, TBool Asc) {
3077  if (StartIdx < EndIdx) {
3078  for (TInt i = StartIdx+1; i <= EndIdx; i++) {
3079  TInt Val = V[i];
3080  TInt j = i;
3081  while ((StartIdx < j) && (CompareRows(V[j-1], Val, SortByTypes, SortByIndices, Asc) > 0)) {
3082  V[j] = V[j-1];
3083  j--;
3084  }
3085  V[j] = Val;
3086  }
3087  }
3088 }
3090 TInt TTable::GetPivot(TIntV& V, TInt StartIdx, TInt EndIdx, const TVec<TAttrType>& SortByTypes, const TIntV& SortByIndices, TBool Asc) {
3091  TInt L = EndIdx - StartIdx + 1;
3092  const TInt Idx1 = StartIdx + TInt::GetRnd(L);
3093  const TInt Idx2 = StartIdx + TInt::GetRnd(L);
3094  const TInt Idx3 = StartIdx + TInt::GetRnd(L);
3095  if (CompareRows(V[Idx1], V[Idx2], SortByTypes, SortByIndices, Asc) < 0) {
3096  if (CompareRows(V[Idx2], V[Idx3], SortByTypes, SortByIndices, Asc) < 0) { return Idx2; }
3097  if (CompareRows(V[Idx1], V[Idx3], SortByTypes, SortByIndices, Asc) < 0) { return Idx3; }
3098  return Idx1;
3099  } else {
3100  if (CompareRows(V[Idx3], V[Idx2], SortByTypes, SortByIndices, Asc) < 0) { return Idx2; }
3101  if (CompareRows(V[Idx3], V[Idx1], SortByTypes, SortByIndices, Asc) < 0) { return Idx3; }
3102  return Idx1;
3103  }
3104 }
3106 TInt TTable::Partition(TIntV& V, TInt StartIdx, TInt EndIdx, const TVec<TAttrType>& SortByTypes, const TIntV& SortByIndices, TBool Asc) {
3108  // test if the elements are already sorted
3109  TInt j;
3110  for (j = StartIdx; j < EndIdx; j++) {
3111  if (CompareRows(V[j], V[j+1], SortByTypes, SortByIndices, Asc) > 0) {
3112  break;
3113  }
3114  }
3115  if (j >= EndIdx) {
3116  return EndIdx+1;
3117  }
3119  TInt PivotIdx = GetPivot(V, StartIdx, EndIdx, SortByTypes, SortByIndices, Asc);
3120  TInt Pivot = V[PivotIdx];
3121  V.Swap(PivotIdx, EndIdx);
3122  TInt StoreIdx = StartIdx;
3123  for (TInt i = StartIdx; i < EndIdx; i++) {
3124  if (CompareRows(V[i], Pivot, SortByTypes, SortByIndices, Asc) <= 0) {
3125  V.Swap(i, StoreIdx);
3126  StoreIdx++;
3127  }
3128  }
3129  // move pivot value to its place
3130  V.Swap(StoreIdx, EndIdx);
3131  return StoreIdx;
3132 }
3134 void TTable::QSort(TIntV& V, TInt StartIdx, TInt EndIdx, const TVec<TAttrType>& SortByTypes, const TIntV& SortByIndices, TBool Asc) {
3135  if (StartIdx < EndIdx) {
3136  if (EndIdx - StartIdx < 20) {
3137  ISort(V, StartIdx, EndIdx, SortByTypes, SortByIndices, Asc);
3138  } else {
3139  TInt Pivot = Partition(V, StartIdx, EndIdx, SortByTypes, SortByIndices, Asc);
3140  if (Pivot > EndIdx) {
3141  return;
3142  }
3143  // Everything <= Pivot will be in StartIdx, Pivot-1. Shrink this
3144  // range to ignore elements equal to the pivot in the first
3145  // recursive call, to optimize for the case when a lot of
3146  // rows are equal.
3147  int Ub = Pivot - 1;
3148  while (Ub >= StartIdx && CompareRows(
3149  V[Ub], V[Pivot], SortByTypes, SortByIndices, Asc) == 0) {
3150  Ub -= 1;
3151  }
3152  QSort(V, StartIdx, Ub, SortByTypes, SortByIndices, Asc);
3153  QSort(V, Pivot+1, EndIdx, SortByTypes, SortByIndices, Asc);
3154  }
3155  }
3156 }
3158 void TTable::Merge(TIntV& V, TInt Idx1, TInt Idx2, TInt Idx3, const TVec<TAttrType>& SortByTypes, const TIntV& SortByIndices, TBool Asc) {
3159  TInt i = Idx1, j = Idx2;
3160  TIntV SortedV;
3161  while (i < Idx2 && j < Idx3) {
3162  if (CompareRows(V[i], V[j], SortByTypes, SortByIndices, Asc) <= 0) {
3163  SortedV.Add(V[i]);
3164  i++;
3165  }
3166  else {
3167  SortedV.Add(V[j]);
3168  j++;
3169  }
3170  }
3171  while (i < Idx2) {
3172  SortedV.Add(V[i]);
3173  i++;
3174  }
3175  while (j < Idx3) {
3176  SortedV.Add(V[j]);
3177  j++;
3178  }
3180  for (TInt sz = 0; sz < Idx3 - Idx1; sz++) {
3181  V[Idx1 + sz] = SortedV[sz];
3182  }
3183 }
3185 #ifdef USE_OPENMP
3186 void TTable::QSortPar(TIntV& V, const TVec<TAttrType>& SortByTypes, const TIntV& SortByIndices, TBool Asc) {
3187  TInt NumThreads = 8; // Setting this to 8 because that results in the fastest sorting on Madmax.
3188  TInt Sz = V.Len();
3189  TIntV IndV, NextV;
3190  for (TInt i = 0; i < NumThreads; i++) {
3191  IndV.Add(i * (Sz / NumThreads));
3192  }
3193  IndV.Add(Sz);
3195  omp_set_num_threads(NumThreads);
3196  #pragma omp parallel for
3197  for (int i = 0; i < NumThreads; i++) {
3198  QSort(V, IndV[i], IndV[i+1] - 1, SortByTypes, SortByIndices, Asc);
3199  }
3201  while (NumThreads > 1) {
3202  omp_set_num_threads(NumThreads / 2);
3203  #pragma omp parallel for
3204  for (int i = 0; i < NumThreads; i += 2) {
3205  Merge(V, IndV[i], IndV[i+1], IndV[i+2], SortByTypes, SortByIndices, Asc);
3206  }
3208  NextV.Clr();
3209  for (TInt i = 0; i < NumThreads; i+=2) {
3210  NextV.Add(IndV[i]);
3211  }
3212  NextV.Add(Sz);
3213  IndV = NextV;
3215  NumThreads = NumThreads / 2;
3216  }
3217 }
3218 #endif // USE_OPENMP
3220 void TTable::Order(const TStrV& OrderBy, TStr OrderColName, TBool ResetRankByMSC, TBool Asc) {
3221  // get a vector of all valid row indices
3222  TIntV ValidRows = TIntV(NumValidRows);
3223  if (NumRows == NumValidRows) {
3224  for (TInt i = 0; i < NumValidRows; i++) {
3225  ValidRows[i] = i;
3226  }
3227  } else {
3228  TInt i = 0;
3229  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
3230  ValidRows[i] = RI.GetRowIdx();
3231  i++;
3232  }
3233  }
3234  TVec<TAttrType> OrderByTypes(OrderBy.Len());
3235  TIntV OrderByIndices(OrderBy.Len());
3236  for (TInt i = 0; i < OrderBy.Len(); i++) {
3237  OrderByTypes[i] = GetColType(OrderBy[i]);
3238  OrderByIndices[i] = GetColIdx(OrderBy[i]);
3239  }
3241  // sort that vector according to the attributes given in "OrderBy" in lexicographic order
3242 #ifdef USE_OPENMP
3243  if (GetMP()) {
3244  QSortPar(ValidRows, OrderByTypes, OrderByIndices, Asc);
3245  } else {
3246 #endif
3247  QSort(ValidRows, 0, NumValidRows-1, OrderByTypes, OrderByIndices, Asc);
3248 #ifdef USE_OPENMP
3249  }
3250 #endif
3252  // rewire Next vector
3253  IsNextDirty = 1;
3254  if (NumValidRows > 0) {
3255  FirstValidRow = ValidRows[0];
3256  } else {
3257  FirstValidRow = Last;
3258  }
3259  for (TInt i = 0; i < NumValidRows-1; i++) {
3260  Next[ValidRows[i]] = ValidRows[i+1];
3261  }
3262  if (NumValidRows > 0) {
3263  Next[ValidRows[NumValidRows-1]] = Last;
3264  LastValidRow = ValidRows[NumValidRows-1];
3265  } else {
3266  LastValidRow = Last;
3267  }
3269  // add rank column
3270  if (!OrderColName.Empty()) {
3271  TIntV RankCol = TIntV(NumRows);
3272  for (TInt i = 0; i < NumValidRows; i++) {
3273  RankCol[ValidRows[i]] = i;
3274  }
3275  if (ResetRankByMSC) {
3276  for (TInt i = 1; i < NumValidRows; i++) {
3277  TStr GroupName = OrderBy[0];
3278  if (GetStrVal(GroupName, ValidRows[i]) != GetStrVal(GroupName, ValidRows[i-1])) {
3279  RankCol[ValidRows[i]] = 0;
3280  } else {
3281  RankCol[ValidRows[i]] = RankCol[ValidRows[i-1]] + 1;
3282  }
3283  }
3284  }
3285  IntCols.Add(RankCol);
3286  AddSchemaCol(OrderColName, atInt);
3287  AddColType(OrderColName, atInt, IntCols.Len()-1);
3288  }
3289 }
// NOTE(review): the line carrying this routine's signature is not present in
// this listing; only the body is visible.
// Compacts the table storage in place: every physical row whose Next entry is
// not Invalid is copied down to the lowest free index, and Next is rewritten
// so the surviving rows are chained 0 -> 1 -> ... -> Last.
3292  TInt FreeIndex = 0;
// Mapping is filled below but not read anywhere in the visible part of the body.
3293  TIntV Mapping; // Mapping[old_index] = new_index/invalid
3295  TInt IdColIdx = GetColIdx(IdColName);
3297  for (TInt i = 0; i < Next.Len(); i++) {
3298  if (Next[i] != TTable::Invalid) {
3299  // "first row" properly set beforehand
3300  if (FreeIndex == 0) {
3301  Assert (i == FirstValidRow);
3302  FirstValidRow = 0;
3303  }
// a row whose successor is Last becomes the last valid row of the compacted table
3305  if (Next[i] != Last) {
3306  Next[FreeIndex] = FreeIndex + 1;
3307  Mapping.Add(FreeIndex);
3308  } else {
3309  Next[FreeIndex] = Last;
3310  LastValidRow = FreeIndex;
3311  Mapping.Add(Last);
3312  }
// re-register the row's id-column value under its new physical index
3314  RowIdMap.AddDat(IntCols[IdColIdx][i], FreeIndex);
// move the cell values of every int, float and string column down to the free slot
3316  for (TInt j = 0; j < IntCols.Len(); j++) {
3317  IntCols[j][FreeIndex] = IntCols[j][i];
3318  }
3319  for (TInt j = 0; j < FltCols.Len(); j++) {
3320  FltCols[j][FreeIndex] = FltCols[j][i];
3321  }
3322  for (TInt j = 0; j < StrColMaps.Len(); j++) {
3323  StrColMaps[j][FreeIndex] = StrColMaps[j][i];
3324  }
3326  FreeIndex++;
3327  } else {
// an invalid (removed) row: it is dropped from the row count
3328  NumRows--;
3329  Mapping.Add(TTable::Invalid);
3330  }
3331  }
// NOTE(review): the listing skips a line after the comment below; the check it
// announces is not visible here - confirm against the original file.
3333  // should match, or bug somewhere
3335 }
// NOTE(review): the line carrying this routine's signature is not present in
// this listing; only the body is visible.
// Truncates the chain of valid rows after its first N entries; rows beyond
// that point are marked Invalid in Next.
3338  if (N == 0) {
// only LastValidRow is reset on this path; the other members are not touched here
3339  LastValidRow = -1;
3340  return;
3341  }
// advance the iterator to the N-th valid row (count is 1-based)
3342  TRowIterator RowI = BegRI();
3343  TInt count = 1;
3344  while (count < N) {
3345  if (!(RowI < EndRI())) {
3346  return; // The table contains less than N rows
3347  }
3348  RowI++;
3349  count++;
3350  }
3351  NumValidRows = N;
3352  TInt LastId = RowI.GetRowIdx();
3353  if (Next[LastId] == Last) {
3354  return; // The table contains exactly N rows
3355  }
3356  // The table contains more than N rows
// walk the rest of the chain, invalidating each link on the way; the N-th row
// is invalidated as well and restored to Last once the walk is finished
3357  TInt CurrId = LastId;
3358  while (Next[CurrId] != Last) {
3359  Assert(Next[CurrId] != Invalid);
3360  TInt NextId = Next[CurrId];
3361  Next[CurrId] = Invalid;
3362  CurrId = NextId;
3363  }
3364  Next[LastId] = Last;
3365  LastValidRow = LastId;
3366 }
3368 inline void TTable::CheckAndAddIntNode(PNEANet Graph, THashSet<TInt>& NodeVals, TInt NodeId) {
3369  if (!NodeVals.IsKey(NodeId)) {
3370  Graph->AddNode(NodeId);
3371  NodeVals.AddKey(NodeId);
3372  }
3373 }
3375 inline void TTable::AddEdgeAttributes(PNEANet& Graph, int RowId) {
3376  for (TInt i = 0; i < EdgeAttrV.Len(); i++) {
3377  TStr ColName = EdgeAttrV[i];
3378  TAttrType T = GetColType(ColName);
3379  TInt Index = GetColIdx(ColName);
3380  switch (T) {
3381  case atInt:
3382  Graph->AddIntAttrDatE(RowId, IntCols[Index][RowId], ColName);
3383  break;
3384  case atFlt:
3385  Graph->AddFltAttrDatE(RowId, FltCols[Index][RowId], ColName);
3386  break;
3387  case atStr:
3388  Graph->AddStrAttrDatE(RowId, GetStrVal(Index, RowId), ColName);
3389  break;
3390  }
3391  }
3392 }
3394 inline void TTable::AddNodeAttributes(TInt NId, TStrV NodeAttrV, TInt RowId, THash<TInt, TStrIntVH>& NodeIntAttrs,
3395  THash<TInt, TStrFltVH>& NodeFltAttrs, THash<TInt, TStrStrVH>& NodeStrAttrs) {
3396  for (TInt i = 0; i < NodeAttrV.Len(); i++) {
3397  TStr ColAttr = NodeAttrV[i];
3398  TAttrType CT = GetColType(ColAttr);
3399  int ColId = GetColIdx(ColAttr);
3400  // check if this is a common src-dst attribute
3401  for (TInt i = 0; i < CommonNodeAttrs.Len(); i++) {
3402  if (CommonNodeAttrs[i].Val1 == ColAttr || CommonNodeAttrs[i].Val2 == ColAttr) {
3403  ColAttr = CommonNodeAttrs[i].Val3;
3404  break;
3405  }
3406  }
3407  if (CT == atInt) {
3408  if (!NodeIntAttrs.IsKey(NId)) { NodeIntAttrs.AddKey(NId); }
3409  if (!NodeIntAttrs.GetDat(NId).IsKey(ColAttr)) { NodeIntAttrs.GetDat(NId).AddKey(ColAttr); }
3410  NodeIntAttrs.GetDat(NId).GetDat(ColAttr).Add(IntCols[ColId][RowId]);
3411  } else if (CT == atFlt) {
3412  if (!NodeFltAttrs.IsKey(NId)) { NodeFltAttrs.AddKey(NId); }
3413  if (!NodeFltAttrs.GetDat(NId).IsKey(ColAttr)) { NodeFltAttrs.GetDat(NId).AddKey(ColAttr); }
3414  NodeFltAttrs.GetDat(NId).GetDat(ColAttr).Add(FltCols[ColId][RowId]);
3415  } else {
3416  if (!NodeStrAttrs.IsKey(NId)) { NodeStrAttrs.AddKey(NId); }
3417  if (!NodeStrAttrs.GetDat(NId).IsKey(ColAttr)) { NodeStrAttrs.GetDat(NId).AddKey(ColAttr); }
3418  NodeStrAttrs.GetDat(NId).GetDat(ColAttr).Add(GetStrVal(ColId, RowId));
3419  }
3420  }
3421 }
3423 // Makes one pass over all the rows in the vector RowIds, and builds
3424 // a PNEANet, with each row as an edge between SrcCol and DstCol.
3425 PNEANet TTable::BuildGraph(const TIntV& RowIds, TAttrAggr AggrPolicy) {
3426  PNEANet Graph = TNEANet::New();
3428  const TAttrType NodeType = GetColType(SrcCol);
3429  Assert(NodeType == GetColType(DstCol));
3430  const TInt SrcColIdx = GetColIdx(SrcCol);
3431  const TInt DstColIdx = GetColIdx(DstCol);
3433  // node values - i.e. the unique values of src/dst col
3434  //THashSet<TInt> IntNodeVals; // for both int and string node attr types.
3435  THash<TFlt, TInt> FltNodeVals;
3437  // node attributes
3438  THash<TInt, TStrIntVH> NodeIntAttrs;
3439  THash<TInt, TStrFltVH> NodeFltAttrs;
3440  THash<TInt, TStrStrVH> NodeStrAttrs;
3442  // make single pass over all rows in given row id set
3443  for (TVec<TInt>::TIter it = RowIds.BegI(); it < RowIds.EndI(); it++) {
3444  TInt CurrRowIdx = *it;
3446  // add src and dst nodes to graph if they are not seen earlier
3447  TInt SVal, DVal;
3448  if (NodeType == atFlt) {
3449  TFlt FSVal = FltCols[SrcColIdx][CurrRowIdx];
3450  SVal = CheckAndAddFltNode(Graph, FltNodeVals, FSVal);
3451  TFlt FDVal = FltCols[SrcColIdx][CurrRowIdx];
3452  DVal = CheckAndAddFltNode(Graph, FltNodeVals, FDVal);
3453  } else if (NodeType == atInt || NodeType == atStr) {
3454  if (NodeType == atInt) {
3455  SVal = IntCols[SrcColIdx][CurrRowIdx];
3456  DVal = IntCols[DstColIdx][CurrRowIdx];
3457  } else {
3458  SVal = StrColMaps[SrcColIdx][CurrRowIdx];
3459  if (strlen(Context->StringVals.GetKey(SVal)) == 0) { continue; } //illegal value
3460  DVal = StrColMaps[DstColIdx][CurrRowIdx];
3461  if (strlen(Context->StringVals.GetKey(DVal)) == 0) { continue; } //illegal value
3462  }
3463  if (!Graph->IsNode(SVal)) { Graph->AddNode(SVal); }
3464  if (!Graph->IsNode(DVal)) { Graph->AddNode(DVal); }
3465  //CheckAndAddIntNode(Graph, IntNodeVals, SVal);
3466  //CheckAndAddIntNode(Graph, IntNodeVals, DVal);
3467  }
3469  // add edge and edge attributes
3470  Graph->AddEdge(SVal, DVal, CurrRowIdx);
3471  if (EdgeAttrV.Len() > 0) { AddEdgeAttributes(Graph, CurrRowIdx); }
3473  // get src and dst node attributes into hashmaps
3474  if (SrcNodeAttrV.Len() > 0) {
3475  AddNodeAttributes(SVal, SrcNodeAttrV, CurrRowIdx, NodeIntAttrs, NodeFltAttrs, NodeStrAttrs);
3476  }
3477  if (DstNodeAttrV.Len() > 0) {
3478  AddNodeAttributes(DVal, DstNodeAttrV, CurrRowIdx, NodeIntAttrs, NodeFltAttrs, NodeStrAttrs);
3479  }
3480  }
3482  // aggregate node attributes and add to graph
3483  if (SrcNodeAttrV.Len() > 0 || DstNodeAttrV.Len() > 0) {
3484  for (TNEANet::TNodeI NodeI = Graph->BegNI(); NodeI < Graph->EndNI(); NodeI++) {
3485  TInt NId = NodeI.GetId();
3486  if (NodeIntAttrs.IsKey(NId)) {
3487  TStrIntVH IntAttrVals = NodeIntAttrs.GetDat(NId);
3488  for (TStrIntVH::TIter it = IntAttrVals.BegI(); it < IntAttrVals.EndI(); it++) {
3489  TInt AttrVal = AggregateVector<TInt>(it.GetDat(), AggrPolicy);
3490  Graph->AddIntAttrDatN(NId, AttrVal, it.GetKey());
3491  }
3492  }
3493  if (NodeFltAttrs.IsKey(NId)) {
3494  TStrFltVH FltAttrVals = NodeFltAttrs.GetDat(NId);
3495  for (TStrFltVH::TIter it = FltAttrVals.BegI(); it < FltAttrVals.EndI(); it++) {
3496  TFlt AttrVal = AggregateVector<TFlt>(it.GetDat(), AggrPolicy);
3497  Graph->AddFltAttrDatN(NId, AttrVal, it.GetKey());
3498  }
3499  }
3500  if (NodeStrAttrs.IsKey(NId)) {
3501  TStrStrVH StrAttrVals = NodeStrAttrs.GetDat(NId);
3502  for (TStrStrVH::TIter it = StrAttrVals.BegI(); it < StrAttrVals.EndI(); it++) {
3503  TStr AttrVal = AggregateVector<TStr>(it.GetDat(), AggrPolicy);
3504  Graph->AddStrAttrDatN(NId, AttrVal, it.GetKey());
3505  }
3506  }
3507  }
3508  }
3510  return Graph;
3511 }
3515 void TTable::InitRowIdBuckets(int NumBuckets) {
3516  for (TInt i = 0; i < RowIdBuckets.Len(); i++) {
3517  RowIdBuckets[i].Clr();
3518  }
3519  RowIdBuckets.Clr();
3521  RowIdBuckets.Gen(NumBuckets);
3522  for (TInt i = 0; i < NumBuckets; i++) {
3523  RowIdBuckets[i].Gen(10, 0);
3524  }
3525 }
3527 void TTable::FillBucketsByWindow(TStr SplitAttr, TInt JumpSize, TInt WindowSize, TInt StartVal, TInt EndVal) {
3528  Assert (JumpSize <= WindowSize);
3529  int NumBuckets, MinBucket, MaxBucket;
3530  TInt SplitColId = GetColIdx(SplitAttr);
3532  if (StartVal == TInt::Mn || EndVal == TInt::Mx) {
3533  // calculate min and max value of the column 'SplitAttr'
3534  TInt MinValue = TInt::Mx;
3535  TInt MaxValue = TInt::Mn;
3536  for (TInt i = 0; i < Next.Len(); i++) {
3537  if (Next[i] != Invalid) {
3538  if (MinValue > IntCols[SplitColId][i]) {
3539  MinValue = IntCols[SplitColId][i];
3540  }
3541  if (MaxValue < IntCols[SplitColId][i]) {
3542  MaxValue = IntCols[SplitColId][i];
3543  }
3544  }
3545  }
3547  if (StartVal == TInt::Mn) StartVal = MinValue;
3548  if (EndVal == TInt::Mx) EndVal = MaxValue;
3549  }
3551  // initialize buckets
3552  if (JumpSize == 0) { NumBuckets = (EndVal - StartVal)/JumpSize + 1; }
3553  else { NumBuckets = (EndVal - StartVal)/JumpSize + 1; }
3555  InitRowIdBuckets(NumBuckets);
3557  // populate RowIdSets by computing the range of buckets for each row
3558  for (TInt i = 0; i < Next.Len(); i++) {
3559  if (Next[i] == Invalid) { continue; }
3560  int SplitVal = IntCols[SplitColId][i];
3561  if (SplitVal < StartVal || SplitVal > EndVal) { continue; }
3562  int RowVal = SplitVal - StartVal;
3563  if (JumpSize == 0) { // expanding windows
3564  MinBucket = RowVal/WindowSize;
3565  MaxBucket = NumBuckets-1;
3566  } else if (JumpSize == WindowSize) { // disjoint windows
3567  MinBucket = MaxBucket = RowVal/JumpSize;
3568  } else { // sliding windows
3569  if (RowVal < WindowSize) { MinBucket = 0; }
3570  else { MinBucket = (RowVal-WindowSize)/JumpSize + 1; }
3571  MaxBucket = RowVal/JumpSize;
3572  }
3573  for (TInt j = MinBucket; j <= MaxBucket; j++) { RowIdBuckets[j].Add(i); }
3574  }
3575 }
3577 void TTable::FillBucketsByInterval(TStr SplitAttr, TIntPrV SplitIntervals) {
3578  TInt SplitColId = GetColIdx(SplitAttr);
3579  int NumBuckets = SplitIntervals.Len();
3580  InitRowIdBuckets(NumBuckets);
3582  // populate RowIdSets by computing the range of buckets for each row
3583  for (TInt i = 0; i < Next.Len(); i++) {
3584  if (Next[i] == Invalid) { continue; }
3585  int SplitVal = IntCols[SplitColId][i];
3586  for (TInt j = 0; j < SplitIntervals.Len(); j++) {
3587  if (SplitVal >= SplitIntervals[j].Val1 && SplitVal < SplitIntervals[j].Val2) {
3588  RowIdBuckets[j].Add(i);
3589  }
3590  }
3591  }
3592 }
// NOTE(review): the line carrying this routine's signature is not present in
// this listing; only the body is visible.
// Builds one graph per non-empty bucket of RowIdBuckets, in bucket order, and
// returns them as a vector. Empty buckets are skipped.
3595  //call BuildGraph on each row id set - parallelizable!
3596  TVec<PNEANet> GraphSequence;
3597  for (TInt i = 0; i < RowIdBuckets.Len(); i++) {
3598  if (RowIdBuckets[i].Len() == 0) { continue; }
3599  PNEANet PNet = BuildGraph(RowIdBuckets[i], AggrPolicy);
3600  GraphSequence.Add(PNet);
3601  }
3603  return GraphSequence;
3604 }
// NOTE(review): the line carrying this routine's signature is not present in
// this listing; only the body is visible.
// Resets the bucket cursor to just before the first bucket, stores the
// aggregation policy in the object, and returns whatever
// GetNextGraphFromSequence() yields.
3607  CurrBucket = -1;
3608  this->AggrPolicy = AggrPolicy;
3609  return GetNextGraphFromSequence();
3610 }
// NOTE(review): the line carrying this routine's signature is not present in
// this listing; only the body is visible.
// Advances CurrBucket to the next non-empty bucket; returns NULL once the
// cursor has moved past the last bucket.
3613  CurrBucket++;
3614  while (CurrBucket < RowIdBuckets.Len() && RowIdBuckets[CurrBucket].Len() == 0) {
3615  CurrBucket++;
3616  }
3617  if (CurrBucket >= RowIdBuckets.Len()) { return NULL; }
// NOTE(review): the listing skips a line here; the statement that returns the
// graph for CurrBucket is not visible - confirm against the original file.
3619 }
3621 // Only integer SplitAttr supported
3622 // Setting JumpSize = WindowSize will give disjoint windows
3623 // Setting JumpSize < WindowSize will give sliding windows
3624 // Setting JumpSize > WindowSize will drop certain rows (currently not supported)
3625 // Setting JumpSize = 0 will give expanding windows (i.e. starting at 0 and ending at i*WindowSize)
3626 // To set the range of values of SplitAttr to be considered, use StartVal and EndVal (inclusive)
3627 // If StartVal == TInt.Mn, then the buckets will start from the min value of SplitAttr in the table.
3628 // If EndVal == TInt.Mx, then the buckets will end at the max value of SplitAttr in the table.
3629 TVec<PNEANet> TTable::ToGraphSequence(TStr SplitAttr, TAttrAggr AggrPolicy, TInt WindowSize, TInt JumpSize, TInt StartVal, TInt EndVal) {
3630  FillBucketsByWindow(SplitAttr, JumpSize, WindowSize, StartVal, EndVal);
3631  printf("buckets filled\n");
3632  return GetGraphsFromSequence(AggrPolicy);
3633 }
3635 TVec<PNEANet> TTable::ToVarGraphSequence(TStr SplitAttr, TAttrAggr AggrPolicy, TIntPrV SplitIntervals) {
3636  FillBucketsByInterval(SplitAttr, SplitIntervals);
3637  return GetGraphsFromSequence(AggrPolicy);
3638 }
// NOTE(review): the line carrying this routine's signature is not present in
// this listing. The body requests disjoint windows of width 1 over the whole
// value range of GroupAttr (WindowSize = JumpSize = 1, automatic start/end).
3641  return ToGraphSequence(GroupAttr, AggrPolicy, TInt(1), TInt(1), TInt::Mn, TInt::Mx);
3642 }
3644 PNEANet TTable::ToGraphSequenceIterator(TStr SplitAttr, TAttrAggr AggrPolicy, TInt WindowSize, TInt JumpSize, TInt StartVal, TInt EndVal) {
3645  FillBucketsByWindow(SplitAttr, JumpSize, WindowSize, StartVal, EndVal);
3646  return GetFirstGraphFromSequence(AggrPolicy);
3647 }
3649 PNEANet TTable::ToVarGraphSequenceIterator(TStr SplitAttr, TAttrAggr AggrPolicy, TIntPrV SplitIntervals) {
3650  FillBucketsByInterval(SplitAttr, SplitIntervals);
3651  return GetFirstGraphFromSequence(AggrPolicy);
3652 }
// NOTE(review): the line carrying this routine's signature is not present in
// this listing. The body starts an iteration over disjoint windows of width 1
// covering the whole value range of GroupAttr.
3655  return ToGraphSequenceIterator(GroupAttr, AggrPolicy, TInt(1), TInt(1), TInt::Mn, TInt::Mx);
3656 }
3658 // calls to this must be preceded by a call to one of the above ToGraph*Iterator functions
// NOTE(review): the signature line is not present in this listing; the body
// simply forwards to GetNextGraphFromSequence().
3660  return GetNextGraphFromSequence();
3661 }
// NOTE(review): the signature line is not present in this listing.
// True when the bucket cursor is at or beyond the last bucket index. Trailing
// empty buckets are not taken into account by this test.
3664  return CurrBucket >= RowIdBuckets.Len() - 1;
3665 }
// NOTE(review): the line carrying this routine's signature is not present in
// this listing; only the body is visible.
// Builds a table with one row per node of Network: an integer "node_id"
// column followed by one column per int, float and string node attribute.
3668  Schema SR;
3669  SR.Add(TPair<TStr,TAttrType>("node_id",atInt));
3671  TStrV IntAttrNames;
3672  TStrV FltAttrNames;
3673  TStrV StrAttrNames;
// the attribute names are read once, through the iterator at the first node
3675  TNEANet::TNodeI NodeI = Network->BegNI();
3676  NodeI.GetIntAttrNames(IntAttrNames);
3677  NodeI.GetFltAttrNames(FltAttrNames);
3678  NodeI.GetStrAttrNames(StrAttrNames);
3679  for (TInt i = 0; i < IntAttrNames.Len(); i++) {
3680  SR.Add(TPair<TStr,TAttrType>(IntAttrNames[i],atInt));
3681  }
3682  for (TInt i = 0; i < FltAttrNames.Len(); i++) {
3683  SR.Add(TPair<TStr,TAttrType>(FltAttrNames[i],atFlt));
3684  }
3685  for (TInt i = 0; i < StrAttrNames.Len(); i++) {
3686  SR.Add(TPair<TStr,TAttrType>(StrAttrNames[i],atStr));
3687  }
3689  PTable T = New(SR, Context);
3691  TInt Cnt = 0;
3692  // populate table columns
3693  while (NodeI < Network->EndNI()) {
// IntCols[0] is the node id, so the i-th int attribute goes to IntCols[i+1]
3694  T->IntCols[0].Add(NodeI.GetId());
3695  for (TInt i = 0; i < IntAttrNames.Len(); i++) {
3696  T->IntCols[i+1].Add(Network->GetIntAttrDatN(NodeI,IntAttrNames[i]));
3697  }
3698  for (TInt i = 0; i < FltAttrNames.Len(); i++) {
3699  T->FltCols[i].Add(Network->GetFltAttrDatN(NodeI,FltAttrNames[i]));
3700  }
3701  for (TInt i = 0; i < StrAttrNames.Len(); i++) {
3702  T->AddStrVal(i, Network->GetStrAttrDatN(NodeI,StrAttrNames[i]));
3703  }
3704  Cnt++;
3705  NodeI++;
3706  }
3707  // set number of rows and "Next" vector
3708  T->NumRows = Cnt;
3709  T->NumValidRows = T->NumRows;
// presumably TIntV(MxVals, Vals) reserves NumRows slots with length 0, so the
// Add calls below append entries 1..NumRows-1 and then Last - TODO confirm
3710  T->Next = TIntV(T->NumRows,0);
3711  for (TInt i = 0; i < T->NumRows-1; i++) {
3712  T->Next.Add(i+1);
3713  }
3714  T->LastValidRow = T->NumRows-1;
// NOTE(review): Last is appended even when Cnt == 0 - confirm that an empty
// network is handled as intended
3715  T->Next.Add(Last);
3716  return T;
3717 }
// NOTE(review): the line carrying this routine's signature is not present in
// this listing; only the body is visible.
// Builds a table with one row per edge of Network: integer columns "edg_id",
// "src_id" and "dst_id", followed by one column per int, float and string
// edge attribute.
3720  Schema SR;
3721  SR.Add(TPair<TStr,TAttrType>("edg_id",atInt));
3722  SR.Add(TPair<TStr,TAttrType>("src_id",atInt));
3723  SR.Add(TPair<TStr,TAttrType>("dst_id",atInt));
3725  TStrV IntAttrNames;
3726  TStrV FltAttrNames;
3727  TStrV StrAttrNames;
// the attribute names are read once, through the iterator at the first edge
3729  TNEANet::TEdgeI EdgeI = Network->BegEI();
3730  EdgeI.GetIntAttrNames(IntAttrNames);
3731  EdgeI.GetFltAttrNames(FltAttrNames);
3732  EdgeI.GetStrAttrNames(StrAttrNames);
3733  for (TInt i = 0; i < IntAttrNames.Len(); i++) {
3734  SR.Add(TPair<TStr,TAttrType>(IntAttrNames[i],atInt));
3735  }
3736  for (TInt i = 0; i < FltAttrNames.Len(); i++) {
3737  SR.Add(TPair<TStr,TAttrType>(FltAttrNames[i],atFlt));
3738  }
3739  for (TInt i = 0; i < StrAttrNames.Len(); i++) {
3740  //printf("%s\n",StrAttrNames[i].CStr());
3741  SR.Add(TPair<TStr,TAttrType>(StrAttrNames[i],atStr));
3742  }
3744  PTable T = New(SR, Context);
3746  TInt Cnt = 0;
3747  // populate table columns
3748  while (EdgeI < Network->EndEI()) {
// IntCols[0..2] hold edge id, source id and destination id, so the i-th int
// attribute goes to IntCols[i+3]
3749  T->IntCols[0].Add(EdgeI.GetId());
3750  T->IntCols[1].Add(EdgeI.GetSrcNId());
3751  T->IntCols[2].Add(EdgeI.GetDstNId());
3752  for (TInt i = 0; i < IntAttrNames.Len(); i++) {
3753  T->IntCols[i+3].Add(Network->GetIntAttrDatE(EdgeI,IntAttrNames[i]));
3754  }
3755  for (TInt i = 0; i < FltAttrNames.Len(); i++) {
3756  T->FltCols[i].Add(Network->GetFltAttrDatE(EdgeI,FltAttrNames[i]));
3757  }
3758  for (TInt i = 0; i < StrAttrNames.Len(); i++) {
3759  T->AddStrVal(i, Network->GetStrAttrDatE(EdgeI,StrAttrNames[i]));
3760  }
3761  Cnt++;
3762  EdgeI++;
3763  }
3764  // set number of rows and "Next" vector
3765  T->NumRows = Cnt;
3766  T->NumValidRows = T->NumRows;
// presumably TIntV(MxVals, Vals) reserves NumRows slots with length 0, so the
// Add calls below append entries 1..NumRows-1 and then Last - TODO confirm
3767  T->Next = TIntV(T->NumRows,0);
3768  for (TInt i = 0; i < T->NumRows-1; i++) {
3769  T->Next.Add(i+1);
3770  }
3771  T->LastValidRow = T->NumRows-1;
// NOTE(review): Last is appended even when Cnt == 0 - confirm that an empty
// network is handled as intended
3772  T->Next.Add(Last);
3773  return T;
3774 }
3776 #ifdef GCC_ATOMIC
// NOTE(review): the line carrying this routine's signature is not present in
// this listing; only the body is visible.
// Builds a two-column ("src_id", "dst_id") table from the edges of Network.
// The edge sequence is first cut into consecutive iterator ranges, which are
// then copied into the table by an OpenMP parallel loop.
3778  Schema SR;
3779  SR.Add(TPair<TStr,TAttrType>("src_id",atInt));
3780  SR.Add(TPair<TStr,TAttrType>("dst_id",atInt));
3782  TNGraphMP::TEdgeI FirstEI = Network->BegEI();
3783  PTable T = New(SR, Context);
3784  TInt NumEdges = Network->GetEdges();
3785  TInt NumPartitions = omp_get_max_threads()*CHUNKS_PER_THREAD;
3786  TInt PartitionSize = NumEdges/NumPartitions;
3787  if (PartitionSize*NumPartitions < NumEdges) { NumPartitions++;}
// Single sequential walk over the edges: a range is closed every
// PartitionSize edges. NumPartitions is not read again after the adjustment
// above.
// NOTE(review): PartitionSize is 0 when there are fewer edges than partitions;
// confirm the range splitting behaves as intended in that case.
3790  TVec<TEIPr> Partitions;
3791  TIntV PartitionSizes;
3792  TNGraphMP::TEdgeI currStart = FirstEI;
3793  TInt currCount = 0;
3794  while (FirstEI < Network->EndEI()){
3795  if (currCount == PartitionSize) {
3796  Partitions.Add(TEIPr(currStart, FirstEI));
3797  currStart = FirstEI;
3798  PartitionSizes.Add(currCount);
3799  //printf("added: %d\n", currCount.Val);
3800  currCount = 0;
3801  }
3802  //printf("%d\n", currCount.Val);
3803  FirstEI++;
3804  currCount++;
3805  }
// the remaining edges form the final range
3806  Partitions.Add(TEIPr(currStart, FirstEI));
3807  PartitionSizes.Add(currCount);
3809  T->ResizeTable(NumEdges);
3810  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD)
3811  for (int p = 0; p < Partitions.Len(); p++) {
3812  TNGraphMP::TEdgeI EdgeI = Partitions[p].GetVal1();
3813  TNGraphMP::TEdgeI EndI = Partitions[p].GetVal2();
3814  //printf("Thread = %d, p = %d, size = %d\n", omp_get_thread_num(), p, PartitionSizes[p].Val);
// presumably GetEmptyRowsStart hands out a distinct block of PartitionSizes[p]
// rows to each caller - TODO confirm against its definition
3815  int start = T->GetEmptyRowsStart(PartitionSizes[p]);
3816  while (EdgeI < EndI) {
3817  T->IntCols[0][start] = EdgeI.GetSrcNId();
3818  T->IntCols[1][start] = EdgeI.GetDstNId();
3819  EdgeI++;
// rows are chained only inside a range; the Next entry of a range's last row
// is not written here
3820  if (EdgeI < EndI) { T->Next[start] = start+1;}
3821  start++;
3822  }
3823  }
3825  Assert(T->NumRows == NumEdges);
3826  return T;
3827 }
3828 #endif // GCC_ATOMIC
3830 PTable TTable::GetFltNodePropertyTable(const PNEANet& Network, const TIntFltH& Property,
3831  const TStr& NodeAttrName, const TAttrType& NodeAttrType, const TStr& PropertyAttrName,
3832  TTableContext* Context) {
3833  Schema SR;
3834  // Determine type of node id
3835  SR.Add(TPair<TStr,TAttrType>(NodeAttrName,NodeAttrType));
3836  SR.Add(TPair<TStr,TAttrType>(PropertyAttrName,atFlt));
3837  PTable T = New(SR, Context);
3838  TInt NodeColIdx = T->GetColIdx(NodeAttrName);
3839  TInt Cnt = 0;
3840  // populate table columns
3841  for (TNEANet::TNodeI NodeI = Network->BegNI(); NodeI < Network->EndNI(); NodeI++) {
3842  switch (NodeAttrType) {
3843  case atInt:
3844  T->IntCols[NodeColIdx].Add(Network->GetIntAttrDatN(NodeI,NodeAttrName));
3845  break;
3846  case atFlt:
3847  T->FltCols[NodeColIdx].Add(Network->GetFltAttrDatN(NodeI,NodeAttrName));
3848  break;
3849  case atStr:
3850  T->AddStrVal(TInt(0), Network->GetStrAttrDatN(NodeI,NodeAttrName));
3851  break;
3852  }
3853  T->FltCols[0].Add(Property.GetDat(NodeI.GetId()));
3854  Cnt++;
3855  }
3856  // set number of rows and "Next" vector
3857  T->NumRows = Cnt;
3858  T->NumValidRows = T->NumRows;
3859  T->Next = TIntV(T->NumRows,0);
3860  for (TInt i = 0; i < T->NumRows-1; i++) {
3861  T->Next.Add(i+1);
3862  }
3863  T->LastValidRow = T->NumRows-1;
3864  T->Next.Add(Last);
3865  return T;
3866 }
3868 /*** Special Filters ***/
3869 PTable TTable::IsNextK(const TStr& OrderCol, TInt K, const TStr& GroupBy, const TStr& RankColName) {
3870  TStrV OrderBy;
3871  if (GroupBy.Empty()) {
3872  OrderBy.Add(OrderCol);
3873  } else {
3874  OrderBy.Add(GroupBy);
3875  OrderBy.Add(OrderCol);
3876  }
3877  if (RankColName.Empty()) {
3878  Order(OrderBy);
3879  } else {
3880  Order(OrderBy, RankColName, true);
3881  }
3882  TAttrType GroupByAttrType = GetColType(GroupBy);
3883  PTable T = InitializeJointTable(*this);
3884  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
3885  TInt Succ = RI.GetRowIdx();
3886  TBool OutOfGroup = false;
3887  for (TInt i = 0; i < K; i++) {
3888  Succ = Next[Succ];
3889  if (Succ == Last) { break; }
3890  switch (GroupByAttrType) {
3891  case atInt:
3892  if (GetIntVal(GroupBy, Succ) != RI.GetIntAttr(GroupBy)) { OutOfGroup = true; }
3893  break;
3894  case atFlt:
3895  if (GetFltVal(GroupBy, Succ) != RI.GetFltAttr(GroupBy)) { OutOfGroup = true; }
3896  break;
3897  case atStr:
3898  if (GetStrVal(GroupBy, Succ) != RI.GetStrAttr(GroupBy)) { OutOfGroup = true; }
3899  break;
3900  }
3901  if (OutOfGroup) { break; } // break out of inner for loop
3902  T->AddJointRow(*this, *this, RI.GetRowIdx(), Succ);
3903  }
3904  }
3905  return T;
3906 }
// NOTE(review): the line carrying this routine's signature is not present in
// this listing; only the body is visible.
// Prints row counts, per-type column counts and the value returned by
// GetMemUsedKB() to stdout.
3909  printf("Total number of rows: %d\n", NumRows.Val);
3910  printf("Number of valid rows: %d\n", NumValidRows.Val);
3911  printf("Number of Int columns: %d\n", IntCols.Len());
3912  printf("Number of Flt columns: %d\n", FltCols.Len());
3913  printf("Number of Str columns: %d\n", StrColMaps.Len());
3914  TSize MemUsed = GetMemUsedKB();
// NOTE(review): %lu assumes TSize is an unsigned long - confirm on the target platforms
3915  printf("Approximated size is %lu KB\n", MemUsed);
3916 }
// NOTE(review): the line carrying this routine's signature is not present in
// this listing; only the body is visible.
// Sums GetMemUsed()/1000 over the Next vector, every int, float and string
// column, and the row-id, grouping and bucket structures. Each term is
// truncated by the integer division before it is added.
3919  TSize ApproxSize = 0;
3920  ApproxSize += Next.GetMemUsed()/1000; // Next vector
3921  for(int i = 0; i < IntCols.Len(); i++){
3922  ApproxSize += IntCols[i].GetMemUsed()/1000;
3923  }
3924  for(int i = 0; i < FltCols.Len(); i++){
3925  ApproxSize += FltCols[i].GetMemUsed()/1000;
3926  }
3927  for(int i = 0; i < StrColMaps.Len(); i++){
3928  ApproxSize += StrColMaps[i].GetMemUsed()/1000;
3929  }
3930  ApproxSize += RowIdMap.GetMemUsed()/1000;
3931  ApproxSize += GroupIDMapping.GetMemUsed()/1000;
3932  ApproxSize += GroupMapping.GetMemUsed()/1000;
3933  ApproxSize += RowIdBuckets.GetMemUsed() / 1000;
3934  return ApproxSize;
3935 }
// NOTE(review): the line carrying this routine's signature is not present in
// this listing; only the body is visible.
// Prints the number of strings in the context's string pool, the pool's
// Reserved() count and the value returned by GetContextMemUsedKB() to stdout.
3938  printf("Number of strings in pool: ");
3939  printf("%d\n", Context->StringVals.Len());
3940  printf("Number of entries in hash table: ");
3941  printf("%d\n", Context->StringVals.Reserved());
3942  TSize MemUsed = GetContextMemUsedKB();
// NOTE(review): %lu assumes TSize is an unsigned long - confirm on the target platforms
3943  printf("Approximate memory used for Context: %lu KB\n", MemUsed);
3944 }
// Returns Context->StringVals.GetMemUsed() unchanged.
// NOTE(review): the signature line is not present in this listing (numbering
// skips 3945-3946). If this is the GetContextMemUsedKB() whose result is printed
// as "KB", note that no division by 1000 is applied here, unlike the per-column
// total computed for the table itself - confirm the intended unit.
3947  TSize ApproxSize = 0;
3948  ApproxSize += Context->StringVals.GetMemUsed();
3949  return ApproxSize;
3950 }
// Appends the rows of T to this table, column by column.
// For every column in this table's schema the matching column of T is located
// by name (this table's id column is paired with T's id column) and its values
// are appended to the local column; an exception is thrown when T has no such
// column. The column type is taken from this table only, so T's column is
// assumed to have the same type. A copy of T.Next is then shifted by the current
// NumRows (entries equal to Last or Invalid are left alone) and appended to
// Next, and NumRows grows by T.NumRows.
// NOTE(review): the body of `if (LastValidRow >= 0)` is empty in this listing and
// the embedded numbering skips 3982, 3984 and 3986, so statements appear to have
// been dropped here (nothing visible links the previous last row to the appended
// rows or updates the valid-row bookkeeping) - confirm against the upstream file.
3952 void TTable::AddTable(const TTable& T) {
3953  //for (TInt c = 0; c < S.Len(); c++) {
3954  // if (S[c] != T.S[c]) { printf("(%s,%d) != (%s,%d)\n", S[c].Val1.CStr(), S[c].Val2, T.S[c].Val1.CStr(), T.S[c].Val2); TExcept::Throw("when adding tables, their schemas must match!"); }
3955  //}
3956  for (TInt c = 0; c < Sch.Len(); c++) {
3957  TStr ColName = GetSchemaColName(c);
3958  TInt ColIdx = GetColIdx(ColName);
3959  TInt TColIdx = ColName == IdColName ? T.GetColIdx(T.IdColName) : T.GetColIdx(ColName);
3960  if (TColIdx < 0) { TExcept::Throw("when adding a table, it must contain all columns of source table!"); }
3961  switch (GetColType(ColName)) {
3962  case atInt:
3963  IntCols[ColIdx].AddV(T.IntCols[TColIdx]);
3964  break;
3965  case atFlt:
3966  FltCols[ColIdx].AddV(T.FltCols[TColIdx]);
3967  break;
3968  case atStr:
3969  StrColMaps[ColIdx].AddV(T.StrColMaps[TColIdx]);
3970  break;
3971  }
3972  }
// re-base T's successor links so they point into the appended region
3974  TIntV TNext(T.Next);
3975  for (TInt i = 0; i < TNext.Len(); i++) {
3976  if (TNext[i] != Last && TNext[i] != Invalid) { TNext[i] += NumRows; }
3977  }
3979  Next.AddV(TNext);
3980  // checks if table is empty
3981  if (LastValidRow >= 0) {
3983  }
3985  NumRows += T.NumRows;
3987 }
3989 // returns physical indices of rows of given table present in our table
3990 // we assume that schema matches exactly (including index of id cols)
// Steps:
//  1. Walk this table's schema; the id column must sit at the same position in
//     Table, every other column must equal Table's schema entry at that position
//     (otherwise an exception is thrown). Each non-id column is added to GroupBy
//     and its per-type column index (looked up in Table) is recorded.
//  2. Group the rows of this table on all non-id columns via GroupAux.
//  3. For each row of Table, build a group key from its int values, float values
//     and string map ids (ints first, then string ids, in one int vector) and add
//     the row's index to Collisions when that key exists in Grouping.
// NOTE(review): `Grouping` is used below but no declaration is visible in this
// listing (the embedded numbering skips 3993/3995) - presumably a dropped line.
3991 void TTable::GetCollidingRows(const TTable& Table, THashSet<TInt>& Collisions) {
3992  TIntV UniqueVec;
3994  TStrV GroupBy;
3996  // indices of columns of each type
3997  TIntV IntGroupByCols;
3998  TIntV FltGroupByCols;
3999  TIntV StrGroupByCols;
4001  TInt IKLen, FKLen, SKLen;
4003  // check that schemas match
4004  for (TInt c = 0; c < Sch.Len(); c++) {
4005  if (Sch[c].Val1 == IdColName) {
4006  if (Table.Sch[c].Val1 != Table.GetIdColName()) {
4007  TExcept::Throw("GetCollidingRows: schemas do not match!");
4008  }
4009  continue;
4010  }
4011  if (Sch[c] != Table.Sch[c]) {
4012  printf("(%s,%d) != (%s,%d)\n", Sch[c].Val1.CStr(), Sch[c].Val2, Table.Sch[c].Val1.CStr(), Table.Sch[c].Val2);
4013  TExcept::Throw("GetCollidingRows: schemas do not match!");
4014  }
4015  GroupBy.Add(NormalizeColName(Sch[c].Val1));
4016  TPair<TAttrType, TInt> ColType = Table.GetColTypeMap(Sch[c].Val1);
4017  switch (ColType.Val1) {
4018  case atInt:
4019  IntGroupByCols.Add(ColType.Val2);
4020  break;
4021  case atFlt:
4022  FltGroupByCols.Add(ColType.Val2);
4023  break;
4024  case atStr:
4025  StrGroupByCols.Add(ColType.Val2);
4026  break;
4027  }
4028  }
4030  IKLen = IntGroupByCols.Len();
4031  FKLen = FltGroupByCols.Len();
4032  SKLen = StrGroupByCols.Len();
4034  // group rows of first table
4035  GroupAux(GroupBy, Grouping, true, "", false, UniqueVec, true);
4037  // find colliding rows of second table
4038  for (TRowIterator it = Table.BegRI(); it < Table.EndRI(); it++) {
4039  // read keys from row
4040  TIntV IKey(IKLen + SKLen, 0);
4041  TFltV FKey(FKLen, 0);
4043  // find group key
4044  for (TInt c = 0; c < IKLen; c++) {
4045  IKey.Add(it.GetIntAttr(IntGroupByCols[c]));
4046  }
4047  for (TInt c = 0; c < FKLen; c++) {
4048  FKey.Add(it.GetFltAttr(FltGroupByCols[c]));
4049  }
4050  for (TInt c = 0; c < SKLen; c++) {
4051  IKey.Add(it.GetStrMapById(StrGroupByCols[c]));
4052  }
4053  // look for group matching the key
4054  TGroupKey GroupKey = TGroupKey(IKey, FKey);
4056  TInt RowIdx = it.GetRowIdx();
4057  if (Grouping.IsKey(GroupKey)) {
4058  // row exists in first table
4059  Collisions.AddKey(RowIdx);
4060  }
4061  }
4062 }
// Adds a new integer column named ColName and fills it from ColVals.
// ColVals must contain exactly NumRows entries; otherwise a message is printed
// and the table is left unchanged. Values are consumed in row-iterator order:
// the k-th row visited by BegRI()..EndRI() receives ColVals[k].
4064 void TTable::StoreIntCol(const TStr& ColName, const TIntV& ColVals) {
4065  if (ColVals.Len() != NumRows) {
4066  printf("new column dimension must agree with number of rows\n");
4067  return;
4068  }
4069  AddSchemaCol(ColName, atInt);
// allocate storage for the new column before indexing it (as StoreFltCol does);
// without this the writes below land in the previously last int column
4070  IntCols.Add(TIntV(NumRows));
4071  TInt ColIdx = IntCols.Len()-1;
4072  TInt i = 0;
4073  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
4074  IntCols[ColIdx][RI.GetRowIdx()] = ColVals[i];
4075  i++;
4076  }
// register the column under its name with the index of the vector just added
4077  TInt L = IntCols.Len();
4078  AddColType(ColName, atInt, L-1);
4079 }
// Adds a new float column named ColName and fills it from ColVals.
// ColVals must contain exactly NumRows entries; otherwise a message is printed
// and the table is left unchanged. A column vector of NumRows entries is
// appended to FltCols, and the k-th row visited by BegRI()..EndRI() receives
// ColVals[k]. Finally the column is registered under ColName with the index of
// the newly appended vector.
4081 void TTable::StoreFltCol(const TStr& ColName, const TFltV& ColVals) {
4082  if (ColVals.Len() != NumRows) {
4083  printf("new column dimension must agree with number of rows\n");
4084  return;
4085  }
4086  AddSchemaCol(ColName, atFlt);
4087  FltCols.Add(TFltV(NumRows));
4088  TInt ColIdx = FltCols.Len()-1;
4089  TInt i = 0;
4090  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
4091  FltCols[ColIdx][RI.GetRowIdx()] = ColVals[i];
4092  i++;
4093  }
4094  TInt L = FltCols.Len();
4095  AddColType(ColName, atFlt, L-1);
4096 }
// Adds a new string column named ColName and fills it from ColVals.
// ColVals must contain exactly NumRows entries; otherwise a message is printed
// and the table is left unchanged. Strings are stored as integer ids of the
// shared pool Context->StringVals; a string not yet in the pool is added first.
// The k-th row visited by BegRI()..EndRI() receives the id of ColVals[k].
4098 void TTable::StoreStrCol(const TStr& ColName, const TStrV& ColVals) {
4099  if (ColVals.Len() != NumRows) {
4100  printf("new column dimension must agree with number of rows\n");
4101  return;
4102  }
4103  AddSchemaCol(ColName, atStr);
// allocate storage for the new column (mirrors StoreFltCol)
4104  StrColMaps.Add(TIntV(NumRows));
// index into StrColMaps, not FltCols: the column lives in the string maps
4105  TInt ColIdx = StrColMaps.Len()-1;
4106  TInt i = 0;
4107  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
4108  TInt Key = Context->StringVals.GetKeyId(ColVals[i]);
// keep the id handed back by AddKey; otherwise -1 would be stored for new strings
4109  if (Key == -1) { Key = Context->StringVals.AddKey(ColVals[i]); }
4110  StrColMaps[ColIdx][RI.GetRowIdx()] = Key;
4111  i++;
4112  }
4113  TInt L = StrColMaps.Len();
4114  AddColType(ColName, atStr, L-1);
4115 }
// Row bookkeeping after one row has been appended: the new row's Next entry is
// set to Last and both NumRows and NumValidRows are incremented.
// NOTE(review): the signature line is not present in this listing (numbering
// skips 4116-4117), the body of `if (LastValidRow >= 0)` is empty and numbers
// 4119, 4122 and 4123 are skipped, so statements appear to have been dropped
// (nothing visible links the previous last row to the new one or updates
// LastValidRow) - confirm against the upstream file.
4118  if (LastValidRow >= 0) {
4120  }
4121  Next.Add(Last);
4124  NumRows++;
4125  NumValidRows++;
4126 }
4128 #ifdef GCC_ATOMIC
// Parallel fill: assigns DefaultFltVal to FltCols[UpdateColIdx] for every row
// visited inside each partition range returned by GetPartitionRanges().
// Throws "Not Using MP!" when GetMP() is false. Each OpenMP iteration handles
// one partition and walks it with a row iterator while RowI < EndI.
4129 void TTable::SetFltColToConstMP(TInt UpdateColIdx, TFlt DefaultFltVal){
4130  if(!GetMP()){ TExcept::Throw("Not Using MP!");}
4131  TIntPrV Partitions;
4132  GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
// (an unused PartitionSize local that read Partitions[0] unconditionally was
// removed; it had no effect other than indexing a possibly empty vector)
4134  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD)
4135  for (int i = 0; i < Partitions.Len(); i++){
4136  TRowIterator RowI(Partitions[i].GetVal1(), this);
4137  TRowIterator EndI(Partitions[i].GetVal2(), this);
4138  while(RowI < EndI){
4139  FltCols[UpdateColIdx][RowI.GetRowIdx()] = DefaultFltVal;
4140  RowI++;
4141  }
4142  }
4143 }
4145 // OP RS 2016/06/30: this wrapper function is required
4146 // for the code to compile on Mac OS X gcc 4.2.1
// Forwards to the GCC builtin __sync_bool_compare_and_swap(lock, 0, 1) and
// returns its result, i.e. it tries to change *lock from 0 to 1.
// NOTE(review): the signature line is not present in this listing (numbering
// skips 4147); the caller in this file passes an int* - confirm upstream.
4148  return(__sync_bool_compare_and_swap(lock, 0, 1));
4149 }
4151 void TTable::UpdateFltFromTableMP(const TStr& KeyAttr, const TStr& UpdateAttr,
4152  const TTable& Table, const TStr& FKeyAttr, const TStr& ReadAttr,
4153  TFlt DefaultFltVal) {
4154  if (!GetMP()) {
4155  TExcept::Throw("Not Using MP!");
4156  }
4158  TAttrType KeyType = GetColType(KeyAttr);
4159  TAttrType FKeyType = Table.GetColType(FKeyAttr);
4160  if(KeyType != FKeyType){TExcept::Throw("Key Type Mismatch");}
4161  if(GetColType(UpdateAttr) != atFlt || Table.GetColType(ReadAttr) != atFlt){
4162  TExcept::Throw("Expecting Float values");
4163  }
4164  TStr NKeyAttr = NormalizeColName(KeyAttr);
4165  //TStr NUpdateAttr = NormalizeColName(UpdateAttr);
4166  //TStr NFKeyAttr = Table.NormalizeColName(FKeyAttr);
4167  //TStr NReadAttr = Table.NormalizeColName(ReadAttr);
4168  TInt UpdateColIdx = GetColIdx(UpdateAttr);
4169  TInt FKeyColIdx = GetColIdx(FKeyAttr);
4170  TInt ReadColIdx = GetColIdx(ReadAttr);
4172  // TODO: this should be a generic vector operation
4173  SetFltColToConstMP(UpdateColIdx, DefaultFltVal);
4175  TIntPrV Partitions;
4176  Table.GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
4177  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
4178  TIntV Locks(NumRows);
4179  Locks.PutAll(0); // need to parallelize this...
4181  switch (KeyType) {
4182  // TODO: add support for other cases of KeyType
4183  case atInt: {
4184  THashMP<TInt,TIntV> Grouping;
4185  // must use physical row ids
4186  GroupByIntColMP(NKeyAttr, Grouping, true);
4187  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD) // num_threads(1)
4188  for (int i = 0; i < Partitions.Len(); i++) {
4189  TRowIterator RowI(Partitions[i].GetVal1(), &Table);
4190  TRowIterator EndI(Partitions[i].GetVal2(), &Table);
4191  while (RowI < EndI) {
4192  TInt K = RowI.GetIntAttr(FKeyColIdx);
4193  if (Grouping.IsKey(K)) {
4194  TIntV& UpdateRows = Grouping.GetDat(K);
4195  for (int j = 0; j < UpdateRows.Len(); j++) {
4196  int* lock = &Locks[UpdateRows[j]].Val;
4197  // OP RS 2016/06/30: needed to define a wrapper function
4198  // for the code to compile on Mac OS X gcc 4.2.1
4199  //if (!__sync_bool_compare_and_swap(lock, 0, 1)) {
4200  if (!sync_bool_compare_and_swap(lock)) {
4201  continue;
4202  }
4203  //printf("key = %d, row = %d, old_score = %f\n", K.Val, j, UpdateRows[j].Val, FltCols[UpdateColIdx][UpdateRows[j]].Val);
4204  FltCols[UpdateColIdx][UpdateRows[j]] = RowI.GetFltAttr(ReadColIdx);
4205  //printf("key = %d, new_score = %f\n", K.Val, j, FltCols[UpdateColIdx][UpdateRows[j]].Val);
4206  } // end of for loop
4207  } // end of if statement
4208  RowI++;
4209  } // end of while loop
4210  } // end of for loop
4211  } // end of case atInt
4212  break;
4213  default:
4214  break;
4215  } // end of outer switch statement
4216 }
4217 #endif // GCC_ATOMIC
// Overwrites float column UpdateAttr of this table with values looked up in
// another table.
//   KeyAttr       - key column of this table
//   UpdateAttr    - float column of this table that receives the values
//   Table         - table the values are read from
//   FKeyAttr      - key column of Table, must have the same type as KeyAttr
//   ReadAttr      - float column of Table that supplies the values
//   DefaultFltVal - value given to every row of UpdateAttr before the lookup
// Throws when any of the four column names is unknown, when the key types
// differ, or when UpdateAttr / ReadAttr are not float columns.
// When compiled with GCC_ATOMIC and GetMP() is true, the work is delegated to
// UpdateFltFromTableMP. Otherwise this table is grouped by KeyAttr and Table is
// scanned row by row; a row whose key matches a group writes its ReadAttr value
// into all rows of that group, so a later matching row of Table overwrites an
// earlier one. Only integer keys are handled; other key types leave the column
// at the default value.
4219 void TTable::UpdateFltFromTable(const TStr& KeyAttr, const TStr& UpdateAttr, const TTable& Table,
4220  const TStr& FKeyAttr, const TStr& ReadAttr, TFlt DefaultFltVal){
4221  if(!IsColName(KeyAttr)){ TExcept::Throw("Bad KeyAttr parameter");}
4222  if(!IsColName(UpdateAttr)){ TExcept::Throw("Bad UpdateAttr parameter");}
4223  if(!Table.IsColName(FKeyAttr)){ TExcept::Throw("Bad FKeyAttr parameter");}
4224  if(!Table.IsColName(ReadAttr)){ TExcept::Throw("Bad ReadAttr parameter");}
4226 #ifdef GCC_ATOMIC
4227  if(GetMP()){
4228  UpdateFltFromTableMP(KeyAttr, UpdateAttr,Table, FKeyAttr, ReadAttr, DefaultFltVal);
4229  return;
4230  }
4231 #endif // GCC_ATOMIC
4233  TAttrType KeyType = GetColType(KeyAttr);
4234  TAttrType FKeyType = Table.GetColType(FKeyAttr);
4235  if(KeyType != FKeyType){TExcept::Throw("Key Type Mismatch");}
4236  if(GetColType(UpdateAttr) != atFlt || Table.GetColType(ReadAttr) != atFlt){
4237  TExcept::Throw("Expecting Float values");
4238  }
4239  TStr NKeyAttr = NormalizeColName(KeyAttr);
4240  TStr NUpdateAttr = NormalizeColName(UpdateAttr);
4241  TStr NFKeyAttr = Table.NormalizeColName(FKeyAttr);
4242  TStr NReadAttr = Table.NormalizeColName(ReadAttr);
4243  TInt UpdateColIdx = GetColIdx(UpdateAttr);
// reset the target column before applying the looked-up values
4245  for(TRowIterator iter = BegRI(); iter < EndRI(); iter++){
4246  FltCols[UpdateColIdx][iter.GetRowIdx()] = DefaultFltVal;
4247  }
4249  switch(KeyType) {
4250  // TODO: add support for other cases of KeyType
4251  case atInt: {
4252  TIntIntVH Grouping;
4253  GroupByIntCol(NKeyAttr, Grouping, TIntV(), true, true);
4254  for (TRowIterator RI = Table.BegRI(); RI < Table.EndRI(); RI++) {
4255  TInt K = RI.GetIntAttr(NFKeyAttr);
4256  if (Grouping.IsKey(K)) {
4257  TIntV& UpdateRows = Grouping.GetDat(K);
4258  for (int i = 0; i < UpdateRows.Len(); i++) {
4259  FltCols[UpdateColIdx][UpdateRows[i]] = RI.GetFltAttr(NReadAttr);
4260  } // end of for loop
4261  } // end of if statement
4262  } // end of for loop
4263  } // end of case atInt
4264  break;
4265  default:
4266  break;
4267  } // end of outer switch statement
4268 }
4271 // can ONLY be called when a table is being initialised (before IDs are allocated)
// Appends the values of the row referenced by RI to this table's columns.
// Every schema column except the id column is read from RI by name and appended
// to the local column of the matching type; string values are copied as their
// integer map ids.
// NOTE(review): the embedded numbering skips 4291 just before the closing brace,
// and nothing visible here updates Next / NumRows / NumValidRows - a trailing
// bookkeeping call may have been dropped from this listing; confirm upstream.
4272 void TTable::AddRow(const TRowIterator& RI) {
4273  for (TInt c = 0; c < Sch.Len(); c++) {
4274  TStr ColName = GetSchemaColName(c);
4275  if (ColName == IdColName) { continue; }
4277  TInt ColIdx = GetColIdx(ColName);
4279  switch (GetColType(ColName)) {
4280  case atInt:
4281  IntCols[ColIdx].Add(RI.GetIntAttr(ColName));
4282  break;
4283  case atFlt:
4284  FltCols[ColIdx].Add(RI.GetFltAttr(ColName));
4285  break;
4286  case atStr:
4287  StrColMaps[ColIdx].Add(RI.GetStrMapByName(ColName));
4288  break;
4289  }
4290  }
4292 }
// Appends one row given as per-type value vectors.
// IntVals[c] is appended to IntCols[c], FltVals[c] to FltCols[c], and each
// StrVals[c] is handed to AddStrVal(c, ...); values are therefore matched to
// columns purely by position within their type.
// NOTE(review): the embedded numbering skips 4304 just before the closing brace,
// and nothing visible here updates Next / NumRows / NumValidRows - a trailing
// bookkeeping call may have been dropped from this listing; confirm upstream.
4294 void TTable::AddRow(const TIntV& IntVals, const TFltV& FltVals, const TStrV& StrVals) {
4295  for (TInt c = 0; c < IntVals.Len(); c++) {
4296  IntCols[c].Add(IntVals[c]);
4297  }
4298  for (TInt c = 0; c < FltVals.Len(); c++) {
4299  FltCols[c].Add(FltVals[c]);
4300  }
4301  for (TInt c = 0; c < StrVals.Len(); c++) {
4302  AddStrVal(c, StrVals[c]);
4303  }
4305 }
// Resizes the physical storage (all Int/Flt/Str column vectors and Next) to
// RowCount entries.
// When Next is shorter than RowCount every vector is grown with
// Reserve(RowCount, RowCount); when it is longer every vector is cut with
// Trunc(RowCount); when it already has RowCount entries nothing is resized.
// The columns and Next are addressed through one flat index i in
// [0, TotalCols]: ints first, then floats, then string maps, and the final
// index stands for Next. With USE_OPENMP defined this single loop is run as an
// OpenMP parallel for, one vector per iteration.
// NOTE(review): in the RowCount == 0 branch the embedded numbering skips
// 4311-4312, so further resets besides NumValidRows may have been dropped from
// this listing - confirm upstream.
4307 void TTable::ResizeTable(int RowCount) {
4308  if (RowCount == 0) {
4309  // initialize empty table
4310  NumValidRows = 0;
4313  }
4314  if (Next.Len() < RowCount) {
4315  TInt FltOffset = IntCols.Len();
4316  TInt StrOffset = FltOffset + FltCols.Len();
4317  TInt TotalCols = StrOffset + StrColMaps.Len();
4318 #ifdef USE_OPENMP
4319  #pragma omp parallel for schedule(static)
4320 #endif
4321  for (int i = 0; i < TotalCols+1; i++) {
4322  if (i < FltOffset) {
4323  IntCols[i].Reserve(RowCount, RowCount);
4324  } else if (i < StrOffset) {
4325  FltCols[i-FltOffset].Reserve(RowCount, RowCount);
4326  } else if (i < TotalCols) {
4327  StrColMaps[i-StrOffset].Reserve(RowCount, RowCount);
4328  } else {
4329  Next.Reserve(RowCount, RowCount);
4330  }
4331  }
4332  } else if (Next.Len() > RowCount) {
4333  TInt FltOffset = IntCols.Len();
4334  TInt StrOffset = FltOffset + FltCols.Len();
4335  TInt TotalCols = StrOffset + StrColMaps.Len();
4336 #ifdef USE_OPENMP
4337  #pragma omp parallel for schedule(static)
4338 #endif
4339  for (int i = 0; i < TotalCols+1; i++) {
4340  if (i < FltOffset) {
4341  IntCols[i].Trunc(RowCount);
4342  } else if (i < StrOffset) {
4343  FltCols[i-FltOffset].Trunc(RowCount);
4344  } else if (i < TotalCols) {
4345  StrColMaps[i-StrOffset].Trunc(RowCount);
4346  } else {
4347  Next.Trunc(RowCount);
4348  }
4349  }
4350  }
4351 }
// Claims NewRows consecutive physical rows at the end of the table and returns
// the index of the first one.
// The claimed block starts at the current NumRows; NumRows and NumValidRows are
// both increased by NewRows. The previous last valid row (if any) is linked to
// the start of the block, LastValidRow becomes the last row of the block and
// its Next entry is set to Last. Links between rows inside the block are not
// set here; the callers in this file fill them in afterwards.
// The storage must already be large enough (asserted via Next.Len()); resizing
// is deliberately not done here, see the comment in the body.
// With USE_OPENMP defined the whole update runs inside an `omp critical` section.
// The callers in this file return early when NewRows is 0, so the
// start+NewRows-1 computation is only reached with NewRows >= 1 from them.
4353 int TTable::GetEmptyRowsStart(int NewRows) {
4354  int start = -1;
4355 #ifdef USE_OPENMP
4356  #pragma omp critical
4357  {
4358 #endif
4359  start = NumRows;
4360  NumRows += NewRows;
4361  NumValidRows += NewRows;
4362  // To make this function thread-safe, the following call must be done before the
4363  // code enters parallel region.
4364  // ResizeTable(NumRows);
4365  Assert(NumRows <= Next.Len());
4366  if (LastValidRow >= 0) {Next[LastValidRow] = start;}
4367  LastValidRow = start+NewRows-1;
4368  Next[LastValidRow] = Last;
4369 #ifdef USE_OPENMP
4370  }
4371 #endif
4372  Assert (start >= 0);
4373  return start;
4374 }
// Copies the rows of Table listed in RowIDs to the end of this table.
// RowIDs are used directly as indices into Table's column vectors. A block of
// RowIDs.Len() rows is claimed with GetEmptyRowsStart(), then for every column
// index present in Table the value at the source row is copied into the same
// column index of this table, so both tables are assumed to share the same
// column layout. Finally each copied row except the last is linked to its
// successor through Next; the last row's link was already set by
// GetEmptyRowsStart(). Does nothing when RowIDs is empty.
4376 void TTable::AddSelectedRows(const TTable& Table, const TIntV& RowIDs) {
4377  int NewRows = RowIDs.Len();
4378  if (NewRows == 0) { return; }
4379  // this call should be thread-safe
4380  int start = GetEmptyRowsStart(NewRows);
4381  for (TInt r = 0; r < NewRows; r++) {
4382  TInt CurrRowIdx = RowIDs[r];
4383  for (TInt i = 0; i < Table.IntCols.Len(); i++) {
4384  IntCols[i][start+r] = Table.IntCols[i][CurrRowIdx];
4385  }
4386  for (TInt i = 0; i < Table.FltCols.Len(); i++) {
4387  FltCols[i][start+r] = Table.FltCols[i][CurrRowIdx];
4388  }
4389  for (TInt i = 0; i < Table.StrColMaps.Len(); i++) {
4390  StrColMaps[i][start+r] = Table.StrColMaps[i][CurrRowIdx];
4391  }
4392  }
4393  for (TInt r = 0; r < NewRows-1; r++) {
4394  Next[start+r] = start+r+1;
4395  }
4396 }
// Appends NewRows rows whose values are supplied column-wise.
// IntColsP[i][r], FltColsP[i][r] and StrColMapsP[i][r] give the value of the
// i-th column of the respective type for new row r; string values are passed as
// integer map ids. A block of NewRows rows is claimed with GetEmptyRowsStart(),
// the values are copied into it, and each new row except the last is linked to
// its successor through Next (the last link was set by GetEmptyRowsStart()).
// Does nothing when NewRows is 0.
4398 void TTable::AddNRows(int NewRows, const TVec<TIntV>& IntColsP, const TVec<TFltV>& FltColsP, const TVec<TIntV>& StrColMapsP) {
4399  if (NewRows == 0) { return; }
4400  // this call should be thread-safe
4401  int start = GetEmptyRowsStart(NewRows);
4402  for (TInt r = 0; r < NewRows; r++) {
4403  for (TInt i = 0; i < IntColsP.Len(); i++) {
4404  IntCols[i][start+r] = IntColsP[i][r];
4405  }
4406  for (TInt i = 0; i < FltColsP.Len(); i++) {
4407  FltCols[i][start+r] = FltColsP[i][r];
4408  }
4409  for (TInt i = 0; i < StrColMapsP.Len(); i++) {
4410  StrColMaps[i][start+r] = StrColMapsP[i][r];
4411  }
4412  }
4413  for (TInt r = 0; r < NewRows-1; r++) {
4414  Next[start+r] = start+r+1;
4415  }
4416 }
4418 #ifdef USE_OPENMP
// Fills this table with joint rows built from T1 and T2, in parallel.
// JointRowIDSet holds one vector of (T1 row, T2 row) index pairs per work unit.
// The prefix sums of the vector lengths give each unit its start offset in the
// output, so the units write to disjoint row ranges. The table is resized to the
// total number of pairs, which also becomes NumRows and NumValidRows, and
// RowIdMap is rebuilt as the identity mapping over those rows.
// Column layout of a joint row: T1's columns of each type come first, T2's
// columns of the same type follow at offset T1.<type>Cols.Len(), and the int
// column at index T1.IntCols.Len() + T2.IntCols.Len() receives the row's own
// physical index.
// Every row is linked to the next physical row; after the parallel loop the last
// row's link is overwritten with Last.
// Throws "Joint table is empty" when there are no pairs at all.
4419 void TTable::AddNJointRowsMP(const TTable& T1, const TTable& T2, const TVec<TIntPrV>& JointRowIDSet) {
4420  //double startFn = omp_get_wtime();
4421  int JointTableSize = 0;
4422  TIntV StartOffsets(JointRowIDSet.Len());
4423  for (int i = 0; i < JointRowIDSet.Len(); i++) {
4424  StartOffsets[i] = JointTableSize;
4425  JointTableSize += JointRowIDSet[i].Len();
4426  }
4427  if (JointTableSize == 0) {
4428  TExcept::Throw("Joint table is empty");
4429  }
4430  //double endOffsets = omp_get_wtime();
4431  //printf("Offsets time = %f\n",endOffsets-startFn);
4432  ResizeTable(JointTableSize);
4433  //double endResize = omp_get_wtime();
4434  //printf("Resize time = %f\n",endResize-endOffsets);
4435  NumRows = JointTableSize;
4436  NumValidRows = JointTableSize;
4437  Assert(NumRows <= Next.Len());
4439  TInt IntOffset = T1.IntCols.Len();
4440  TInt FltOffset = T1.FltCols.Len();
4441  TInt StrOffset = T1.StrColMaps.Len();
4443  TInt IdOffset = IntOffset + T2.IntCols.Len();
4444  RowIdMap.Clr();
4445  for (TInt IdCnt = 0; IdCnt < JointTableSize; IdCnt++) {
4446  RowIdMap.AddDat(IdCnt, IdCnt);
4447  }
4449  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD)
4450  for (int j = 0; j < JointRowIDSet.Len(); j++) {
4451  const TIntPrV& RowIDs = JointRowIDSet[j];
4452  int start = StartOffsets[j];
4453  int NewRows = RowIDs.Len();
4454  if (NewRows == 0) {continue;}
4455  for (TInt r = 0; r < NewRows; r++){
4456  TIntPr CurrRowIdPr = RowIDs[r];
4457  for(TInt i = 0; i < T1.IntCols.Len(); i++){
4458  IntCols[i][start+r] = T1.IntCols[i][CurrRowIdPr.GetVal1()];
4459  }
4460  for(TInt i = 0; i < T1.FltCols.Len(); i++){
4461  FltCols[i][start+r] = T1.FltCols[i][CurrRowIdPr.GetVal1()];
4462  }
4463  for(TInt i = 0; i < T1.StrColMaps.Len(); i++){
4464  StrColMaps[i][start+r] = T1.StrColMaps[i][CurrRowIdPr.GetVal1()];
4465  }
4466  for(TInt i = 0; i < T2.IntCols.Len(); i++){
4467  IntCols[i+IntOffset][start+r] = T2.IntCols[i][CurrRowIdPr.GetVal2()];
4468  }
4469  for(TInt i = 0; i < T2.FltCols.Len(); i++){
4470  FltCols[i+FltOffset][start+r] = T2.FltCols[i][CurrRowIdPr.GetVal2()];
4471  }
4472  for(TInt i = 0; i < T2.StrColMaps.Len(); i++){
4473  StrColMaps[i+StrOffset][start+r] = T2.StrColMaps[i][CurrRowIdPr.GetVal2()];
4474  }
4475  IntCols[IdOffset][start+r] = start+r;
4476  }
4477  for(TInt r = 0; r < NewRows; r++){
4478  Next[start+r] = start+r+1;
4479  }
4480  }
4481  LastValidRow = JointTableSize-1;
4482  Next[LastValidRow] = Last;
4483  //double endIterate = omp_get_wtime();
4484  //printf("Iterate time = %f\n",endIterate-endResize);
4485 }
4486 #endif // USE_OPENMP
// Builds a new table containing the rows of this table followed by the rows of
// `Table`, without removing duplicates: a schema is built from all columns of
// this table except the id column, a table with that schema and the same Context
// is created, this table is appended with AddTable() and `Table` with
// UnionAllInPlace(). The new table is returned.
// NOTE(review): the signature line is not present in this listing (numbering
// skips 4487-4488) - confirm the function name and parameter upstream.
4489  Schema NewSchema;
4490  for (TInt c = 0; c < Sch.Len(); c++) {
4491  if (Sch[c].Val1 != GetIdColName()) {
4492  NewSchema.Add(TPair<TStr,TAttrType>(Sch[c].Val1, Sch[c].Val2));
4493  }
4494  }
4495  PTable result = TTable::New(NewSchema, Context);
4496  result->AddTable(*this);
4497  result->UnionAllInPlace(Table);
4498  return result;
4499 }
// Appends all rows of Table to this table by delegating to AddTable().
// Row ids are not (re)initialised afterwards; see the TODO below.
4501 void TTable::UnionAllInPlace(const TTable& Table) {
4502  AddTable(Table);
4503  // TODO: For the moment, IDs are not initialized (to avoid having too many ID columns)
4504  //result->InitIds();
4505 }
// Returns a new table holding the union of this table and Table.
// The result schema is this table's schema without the id column. The rows of
// Table that also occur in this table are determined with GetCollidingRows().
// The result first receives all rows of this table and is reduced with
// Unique() over all result columns; then every row of Table that did not
// collide is appended one by one. Rows appended from Table are not passed
// through Unique() again here. Ids of the result are initialised before it is
// returned.
4508 PTable TTable::Union(const TTable& Table) {
4509  Schema NewSchema;
4510  THashSet<TInt> Collisions;
4511  TStrV ColNames;
4513  for (TInt c = 0; c < Sch.Len(); c++) {
4514  if (Sch[c].Val1 != GetIdColName()) {
4515  NewSchema.Add(TPair<TStr,TAttrType>(Sch[c].Val1, Sch[c].Val2));
4516  ColNames.Add(Sch[c].Val1);
4517  }
4518  }
4519  PTable result = TTable::New(NewSchema, Context);
4521  GetCollidingRows(Table, Collisions);
4523  result->AddTable(*this);
4525  result->Unique(ColNames);
4527  // this part should be made faster by adding all the rows in one go
4528  for (TRowIterator it = Table.BegRI(); it < Table.EndRI(); it++) {
4529  if (!Collisions.IsKey(it.GetRowIdx())) {
4530  result->AddRow(it);
4531  }
4532  }
4534  // printf("this: %d %d, table: %d %d, result: %d %d\n",
4535  // this->GetNumRows().Val, this->GetNumValidRows().Val,
4536  // Table.GetNumRows().Val, Table.GetNumValidRows().Val,
4537  // result->GetNumRows().Val, result->GetNumValidRows().Val);
4539  result->InitIds();
4540  return result;
4541 }
// Builds a new table from the rows of `Table` that also occur in this table.
// The result schema is this table's schema without the id column. The matching
// rows of `Table` are found with GetCollidingRows() and appended one by one;
// ids of the result are initialised before it is returned.
// NOTE(review): the signature line is not present in this listing (numbering
// skips 4542-4544) - confirm the function name and parameter upstream.
4545  Schema NewSchema;
4546  THashSet<TInt> Collisions;
4548  for (TInt c = 0; c < Sch.Len(); c++) {
4549  if (Sch[c].Val1 != GetIdColName()) {
4550  NewSchema.Add(TPair<TStr,TAttrType>(Sch[c].Val1, Sch[c].Val2));
4551  }
4552  }
4553  PTable result = TTable::New(NewSchema, Context);
4555  GetCollidingRows(Table, Collisions);
4557  // this part should be made faster by adding all the rows in one go
4558  for (TRowIterator it = Table.BegRI(); it < Table.EndRI(); it++) {
4559  if (Collisions.IsKey(it.GetRowIdx())) {
4560  result->AddRow(it);
4561  }
4562  }
4563  result->InitIds();
4564  return result;
4565 }
4567 // TTable cannot be const because we will eventually call Table->GroupAux
4568 // as of now, GroupAux cannot be const because it modifies the table in some cases
// Builds a new table from the rows of this table that do not occur in `Table`.
// The result schema is this table's schema without the id column.
// GetCollidingRows() is invoked on `Table` with this table as argument, so
// Collisions holds indices of this table's rows that are present in `Table`;
// every other row of this table is appended to the result. Ids of the result
// are initialised before it is returned.
// NOTE(review): the signature line is not present in this listing (numbering
// skips 4569) - confirm the function name and parameter upstream.
4570  Schema NewSchema;
4571  THashSet<TInt> Collisions;
4573  for (TInt c = 0; c < Sch.Len(); c++) {
4574  if (Sch[c].Val1 != GetIdColName()) {
4575  NewSchema.Add(TPair<TStr,TAttrType>(Sch[c].Val1, Sch[c].Val2));
4576  }
4577  }
4578  PTable result = TTable::New(NewSchema, Context);
4580  Table.GetCollidingRows(*this, Collisions);
4582  // this part should be made faster by adding all the rows in one go
4583  for (TRowIterator it = BegRI(); it < EndRI(); it++) {
4584  if (!Collisions.IsKey(it.GetRowIdx())) {
4585  result->AddRow(it);
4586  }
4587  }
4588  result->InitIds();
4589  return result;
4590 }
// Returns a new table restricted to the columns named in ProjectCols.
// Throws "no such column <name>" for any name that is not a column of this
// table. The result schema lists the requested columns, in the given order,
// with their types from this table; the data is copied with AddTable() and ids
// are initialised on the result.
4592 PTable TTable::Project(const TStrV& ProjectCols) {
4593  Schema NewSchema;
4594  for (TInt c = 0; c < ProjectCols.Len(); c++) {
4595  if (!IsColName(ProjectCols[c])) { TExcept::Throw("no such column " + ProjectCols[c]); }
4596  NewSchema.Add(TPair<TStr,TAttrType>(ProjectCols[c], GetColType(ProjectCols[c])));
4597  }
4599  PTable result = TTable::New(NewSchema, Context);
4600  result->AddTable(*this);
4601  result->InitIds();
4602  return result;
4603 }
// Returns whether Attr names a column of this table; forwards to IsColName().
4605 TBool TTable::IsAttr(const TStr& Attr) {
4606  return IsColName(Attr);
4607 }
// Produces a fresh "<base>-<n>" name for ColName.
// If the second-to-last character of ColName is '-', the name is first cut back
// with GetSubStr(0, Len()-3) (presumably dropping the two-character "-x" suffix;
// confirm GetSubStr's end-index convention). The schema is then scanned for
// column names that, cut back the same way, equal this base; n is that count
// plus one.
4609 TStr TTable::RenumberColName(const TStr& ColName) const {
4610  TStr NColName = ColName;
// names shorter than two characters cannot carry a "-x" suffix; checking the
// length first avoids reading a character at a negative position
4611  if (NColName.Len() >= 2 && NColName.GetCh(NColName.Len()-2) == '-') {
4612  NColName = NColName.GetSubStr(0,NColName.Len()-3);
4613  }
4614  TInt Conflicts = 0;
4615  for (TInt i = 0; i < Sch.Len(); i++) {
4616  if (NColName == Sch[i].Val1.GetSubStr(0, Sch[i].Val1.Len()-3)) {
4617  Conflicts++;
4618  }
4619  }
4620  Conflicts++;
4621  NColName = NColName + "-" + Conflicts.GetStr();
4622  return NColName;
4623 }
// Returns the name of ColName without its "-x" suffix when that is unambiguous.
// Empty names and names starting with '_' are returned unchanged. Otherwise, if
// the second-to-last character is '-', the name is cut back with
// GetSubStr(0, Len()-3) (presumably dropping the two-character suffix; confirm
// GetSubStr's end-index convention). The schema is scanned for column names
// that, cut back the same way, equal this base: with more than one match the
// original ColName is returned, otherwise the base.
4625 TStr TTable::DenormalizeColName(const TStr& ColName) const {
4626  TStr DColName = ColName;
4627  if (DColName.Len() == 0) { return DColName; }
4628  if (DColName.GetCh(0) == '_') { return DColName; }
// a one-character name cannot carry a "-x" suffix; checking the length first
// avoids reading the character at position -1
4629  if (DColName.Len() >= 2 && DColName.GetCh(DColName.Len()-2) == '-') {
4630  DColName = DColName.GetSubStr(0,DColName.Len()-3);
4631  }
4632  TInt Conflicts = 0;
4633  for (TInt i = 0; i < Sch.Len(); i++) {
4634  if (DColName == Sch[i].Val1.GetSubStr(0, Sch[i].Val1.Len()-3)) {
4635  Conflicts++;
4636  }
4637  }
4638  if (Conflicts > 1) { return ColName; }
4639  else { return DColName; }
4640 }
// Returns a copy of the schema in which every column name has been passed
// through DenormalizeColName(); column types are kept as they are.
// NOTE(review): the signature line is not present in this listing (numbering
// skips 4641-4642) - confirm the function name upstream.
4643  Schema DSch;
4644  for (TInt i = 0; i < Sch.Len(); i++) {
4645  DSch.Add(TPair<TStr, TAttrType>(DenormalizeColName(Sch[i].Val1), Sch[i].Val2));
4646  }
4647  return DSch;
4648 }
// Adds an integer column named ColName with one entry per physical row.
// The column is added to the schema, a vector of NumRows entries is appended to
// IntCols, and the name is registered with the index of that new vector.
4650 void TTable::AddIntCol(const TStr& ColName) {
4651  AddSchemaCol(ColName, atInt);
// allocate the column vector (mirrors AddFltCol); without it the index L-1
// registered below would refer to the previously last int column
4652  IntCols.Add(TIntV(NumRows));
4653  TInt L = IntCols.Len();
4654  AddColType(ColName, atInt, L-1);
4655 }
// Adds a float column named ColName with one entry per physical row.
// The column is added to the schema, a vector of NumRows entries is appended to
// FltCols, and the name is registered with the index of that new vector.
4657 void TTable::AddFltCol(const TStr& ColName) {
4658  AddSchemaCol(ColName, atFlt);
4659  FltCols.Add(TFltV(NumRows));
4660  TInt L = FltCols.Len();
4661  AddColType(ColName, atFlt, L-1);
4662 }
// Adds a string column named ColName with one entry per physical row.
// The column is added to the schema, a vector of NumRows string-map ids is
// appended to StrColMaps, and the name is registered with the index of that new
// vector.
4664 void TTable::AddStrCol(const TStr& ColName) {
4665  AddSchemaCol(ColName, atStr);
// allocate the column vector (mirrors AddFltCol); without it the index L-1
// registered below would refer to the previously last string column
4666  StrColMaps.Add(TIntV(NumRows));
4667  TInt L = StrColMaps.Len();
4668  AddColType(ColName, atStr, L-1);
4669 }
// Adds an integer label column named LabelName.
// Every physical row first receives NegativeLabel; the rows whose indices are
// listed in SelectedRows are then set to PositiveLabel. SelectedRows entries
// are used directly as indices into the column vector.
4671 void TTable::ClassifyAux(const TIntV& SelectedRows, const TStr& LabelName, const TInt& PositiveLabel, const TInt& NegativeLabel) {
4672  AddSchemaCol(LabelName, atInt);
// index the label column will have once it is appended below
4673  TInt LabelColIdx = IntCols.Len();
4674  AddColType(LabelName, atInt, LabelColIdx);
// append the column itself; IntCols[LabelColIdx] does not exist before this
4675  IntCols.Add(TIntV(NumRows));
4676  for (TInt i = 0; i < NumRows; i++) {
4677  IntCols[LabelColIdx][i] = NegativeLabel;
4678  }
4679  for (TInt i = 0; i < SelectedRows.Len(); i++) {
4680  IntCols[LabelColIdx][SelectedRows[i]] = PositiveLabel;
4681  }
4682 }
4684 #ifdef USE_OPENMP
// Parallel worker for the single-table ColGenericOp.
// For every row visited in each partition range, applies `op` to the values of
// the two argument columns and stores the result in column ResColIdx.
// The result is an integer only when both arguments are integer columns (then
// it is written to IntCols); otherwise both values are taken as TFlt, integers
// being converted, and the result is written to FltCols.
// aoMod is only defined for the integer case; for floats an exception is thrown.
// NOTE(review): that throw happens inside the OpenMP parallel loop, and no zero
// check is made before integer division or modulo - confirm callers rule these
// cases out.
4685 void TTable::ColGenericOpMP(TInt ArgColIdx1, TInt ArgColIdx2, TAttrType ArgType1, TAttrType ArgType2, TInt ResColIdx, TArithOp op){
4686  TAttrType ResType = atFlt;
4687  if(ArgType1 == atInt && ArgType2 == atInt){ ResType = atInt;}
4688  TIntPrV Partitions;
4689  GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
// (an unused PartitionSize local that read Partitions[0] unconditionally was
// removed; it had no effect other than indexing a possibly empty vector)
4691  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD)
4692  for (int i = 0; i < Partitions.Len(); i++){
4693  TRowIterator RowI(Partitions[i].GetVal1(), this);
4694  TRowIterator EndI(Partitions[i].GetVal2(), this);
4695  while(RowI < EndI){
4696  if(ResType == atInt){
4697  TInt V1 = RowI.GetIntAttr(ArgColIdx1);
4698  TInt V2 = RowI.GetIntAttr(ArgColIdx2);
4699  if (op == aoAdd) { IntCols[ResColIdx][RowI.GetRowIdx()] = V1 + V2; }
4700  if (op == aoSub) { IntCols[ResColIdx][RowI.GetRowIdx()] = V1 - V2; }
4701  if (op == aoMul) { IntCols[ResColIdx][RowI.GetRowIdx()] = V1 * V2; }
4702  if (op == aoDiv) { IntCols[ResColIdx][RowI.GetRowIdx()] = V1 / V2; }
4703  if (op == aoMod) { IntCols[ResColIdx][RowI.GetRowIdx()] = V1 % V2; }
4704  if (op == aoMin) { IntCols[ResColIdx][RowI.GetRowIdx()] = (V1 < V2) ? V1 : V2;}
4705  if (op == aoMax) { IntCols[ResColIdx][RowI.GetRowIdx()] = (V1 > V2) ? V1 : V2;}
4706  } else{
4707  TFlt V1 = (ArgType1 == atInt) ? (TFlt)RowI.GetIntAttr(ArgColIdx1) : RowI.GetFltAttr(ArgColIdx1);
4708  TFlt V2 = (ArgType2 == atInt) ? (TFlt)RowI.GetIntAttr(ArgColIdx2) : RowI.GetFltAttr(ArgColIdx2);
4709  if (op == aoAdd) { FltCols[ResColIdx][RowI.GetRowIdx()] = V1 + V2; }
4710  if (op == aoSub) { FltCols[ResColIdx][RowI.GetRowIdx()] = V1 - V2; }
4711  if (op == aoMul) { FltCols[ResColIdx][RowI.GetRowIdx()] = V1 * V2; }
4712  if (op == aoDiv) { FltCols[ResColIdx][RowI.GetRowIdx()] = V1 / V2; }
4713  if (op == aoMod) { TExcept::Throw("Cannot find modulo for float columns"); }
4714  if (op == aoMin) { FltCols[ResColIdx][RowI.GetRowIdx()] = (V1 < V2) ? V1 : V2;}
4715  if (op == aoMax) { FltCols[ResColIdx][RowI.GetRowIdx()] = (V1 > V2) ? V1 : V2;}
4716  }
4717  RowI++;
4718  }
4719  }
4720 }
4721 #endif // USE_OPENMP
4723 /* Performs generic operations on two numeric attributes
4724  * Operation can be +, -, *, /, %, min or max
4725  * Alternative is to write separate functions for each operation
4726  * Branch prediction may result in as fast performance anyway ?
4727  *
4728  */
4729 void TTable::ColGenericOp(const TStr& Attr1, const TStr& Attr2, const TStr& ResAttr, TArithOp op) {
4730  // check if attributes are valid
4731  if (!IsAttr(Attr1)) TExcept::Throw("No attribute present: " + Attr1);
4732  if (!IsAttr(Attr2)) TExcept::Throw("No attribute present: " + Attr2);
4733  TPair<TAttrType, TInt> Info1 = GetColTypeMap(Attr1);
4734  TPair<TAttrType, TInt> Info2 = GetColTypeMap(Attr2);
4735  TAttrType Arg1Type = Info1.Val1;
4736  TAttrType Arg2Type = Info2.Val1;
4737  if (Arg1Type == atStr || Arg2Type == atStr) {
4738  TExcept::Throw("Only numeric columns supported in arithmetic operations.");
4739  }
4740  if(Arg1Type == atInt && Arg2Type == atFlt && ResAttr == ""){
4741  TExcept::Throw("Trying to write float values to an existing int-typed column");
4742  }
4743  // source column indices
4744  TInt ColIdx1 = Info1.Val2;
4745  TInt ColIdx2 = Info2.Val2;
4747  // destination column index
4748  TInt ColIdx3 = ColIdx1;
4749  // Create empty result column with type that of first attribute
4750  if (ResAttr != "") {
4751  if (Arg1Type == atInt && Arg2Type == atInt) {
4752  AddIntCol(ResAttr);
4753  }
4754  else {
4755  AddFltCol(ResAttr);
4756  }
4757  ColIdx3 = GetColIdx(ResAttr);
4758  }
4759 #ifdef USE_OPENMP
4760  if(GetMP()){
4761  ColGenericOpMP(ColIdx1, ColIdx2, Arg1Type, Arg2Type, ColIdx3, op);
4762  return;
4763  }
4764 #endif //USE_OPENMP
4765  TAttrType ResType = atFlt;
4766  if(Arg1Type == atInt && Arg2Type == atInt){ printf("hooray!\n"); ResType = atInt;}
4767  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
4768  //printf("%d %d %d %d\n", ColIdx1.Val, ColIdx2.Val, ColIdx3.Val, RowI.GetRowIdx().Val);
4769  if(ResType == atInt){
4770  TInt V1 = RowI.GetIntAttr(ColIdx1);
4771  TInt V2 = RowI.GetIntAttr(ColIdx2);
4772  if (op == aoAdd) { IntCols[ColIdx3][RowI.GetRowIdx()] = V1 + V2; }
4773  if (op == aoSub) { IntCols[ColIdx3][RowI.GetRowIdx()] = V1 - V2; }
4774  if (op == aoMul) { IntCols[ColIdx3][RowI.GetRowIdx()] = V1 * V2; }
4775  if (op == aoDiv) { IntCols[ColIdx3][RowI.GetRowIdx()] = V1 / V2; }
4776  if (op == aoMod) { IntCols[ColIdx3][RowI.GetRowIdx()] = V1 % V2; }
4777  if (op == aoMin) { IntCols[ColIdx3][RowI.GetRowIdx()] = (V1 < V2) ? V1 : V2;}
4778  if (op == aoMax) { IntCols[ColIdx3][RowI.GetRowIdx()] = (V1 > V2) ? V1 : V2;}
4779  } else{
4780  TFlt V1 = (Arg1Type == atInt) ? (TFlt)RowI.GetIntAttr(ColIdx1) : RowI.GetFltAttr(ColIdx1);
4781  TFlt V2 = (Arg2Type == atInt) ? (TFlt)RowI.GetIntAttr(ColIdx2) : RowI.GetFltAttr(ColIdx2);
4782  if (op == aoAdd) { FltCols[ColIdx3][RowI.GetRowIdx()] = V1 + V2; }
4783  if (op == aoSub) { FltCols[ColIdx3][RowI.GetRowIdx()] = V1 - V2; }
4784  if (op == aoMul) { FltCols[ColIdx3][RowI.GetRowIdx()] = V1 * V2; }
4785  if (op == aoDiv) { FltCols[ColIdx3][RowI.GetRowIdx()] = V1 / V2; }
4786  if (op == aoMod) { TExcept::Throw("Cannot find modulo for float columns"); }
4787  if (op == aoMin) { FltCols[ColIdx3][RowI.GetRowIdx()] = (V1 < V2) ? V1 : V2;}
4788  if (op == aoMax) { FltCols[ColIdx3][RowI.GetRowIdx()] = (V1 > V2) ? V1 : V2;}
4789  }
4790  }
4791 }
// Row-wise Attr1 + Attr2; forwards to ColGenericOp with aoAdd.
// ResultAttrName is passed through as the result column name.
4793 void TTable::ColAdd(const TStr& Attr1, const TStr& Attr2, const TStr& ResultAttrName) {
4794  ColGenericOp(Attr1, Attr2, ResultAttrName, aoAdd);
4795 }
// Row-wise Attr1 - Attr2; forwards to ColGenericOp with aoSub.
// ResultAttrName is passed through as the result column name.
4797 void TTable::ColSub(const TStr& Attr1, const TStr& Attr2, const TStr& ResultAttrName) {
4798  ColGenericOp(Attr1, Attr2, ResultAttrName, aoSub);
4799 }
// Row-wise Attr1 * Attr2; forwards to ColGenericOp with aoMul.
// ResultAttrName is passed through as the result column name.
4801 void TTable::ColMul(const TStr& Attr1, const TStr& Attr2, const TStr& ResultAttrName) {
4802  ColGenericOp(Attr1, Attr2, ResultAttrName, aoMul);
4803 }
// Row-wise Attr1 / Attr2; forwards to ColGenericOp with aoDiv.
// ResultAttrName is passed through as the result column name.
4805 void TTable::ColDiv(const TStr& Attr1, const TStr& Attr2, const TStr& ResultAttrName) {
4806  ColGenericOp(Attr1, Attr2, ResultAttrName, aoDiv);
4807 }
// Row-wise Attr1 % Attr2; forwards to ColGenericOp with aoMod.
// ResultAttrName is passed through as the result column name.
4809 void TTable::ColMod(const TStr& Attr1, const TStr& Attr2, const TStr& ResultAttrName) {
4810  ColGenericOp(Attr1, Attr2, ResultAttrName, aoMod);
4811 }
// Row-wise minimum of Attr1 and Attr2; forwards to ColGenericOp with aoMin.
// ResultAttrName is passed through as the result column name.
4813 void TTable::ColMin(const TStr& Attr1, const TStr& Attr2, const TStr& ResultAttrName) {
4814  ColGenericOp(Attr1, Attr2, ResultAttrName, aoMin);
4815 }
// Row-wise maximum of Attr1 and Attr2; forwards to ColGenericOp with aoMax.
// ResultAttrName is passed through as the result column name.
4817 void TTable::ColMax(const TStr& Attr1, const TStr& Attr2, const TStr& ResultAttrName) {
4818  ColGenericOp(Attr1, Attr2, ResultAttrName, aoMax);
4819 }
4821 void TTable::ColGenericOp(const TStr& Attr1, TTable& Table, const TStr& Attr2, const TStr& ResAttr,
4822  TArithOp op, TBool AddToFirstTable) {
4823  // check if attributes are valid
4824  if (!IsAttr(Attr1)) { TExcept::Throw("No attribute present: " + Attr1); }
4825  if (!Table.IsAttr(Attr2)) { TExcept::Throw("No attribute present: " + Attr2); }
4827  if (NumValidRows != Table.NumValidRows) {
4828  TExcept::Throw("Tables do not have equal number of rows");
4829  }
4831  TPair<TAttrType, TInt> Info1 = GetColTypeMap(Attr1);
4832  TPair<TAttrType, TInt> Info2 = Table.GetColTypeMap(Attr2);
4833  TAttrType Arg1Type = Info1.Val1;
4834  TAttrType Arg2Type = Info2.Val1;
4835  if (Info1.Val1 == atStr || Info2.Val1 == atStr) {
4836  TExcept::Throw("Only numeric columns supported in arithmetic operations.");
4837  }
4838  if(Arg1Type == atInt && Arg2Type == atFlt && ResAttr == ""){
4839  TExcept::Throw("Trying to write float values to an existing int-typed column");
4840  }
4841  // source column indices
4842  TInt ColIdx1 = Info1.Val2;
4843  TInt ColIdx2 = Info2.Val2;
4845  // destination column index
4846  TInt ColIdx3 = AddToFirstTable ? ColIdx1 : ColIdx2;
4848  // Create empty result column in appropriate table with type that of first attribute
4849  if (ResAttr != "") {
4850  if (AddToFirstTable) {
4851  if (Arg1Type == atInt && Arg2Type == atInt) {
4852  AddIntCol(ResAttr);
4853  } else {
4854  AddFltCol(ResAttr);
4855  }
4856  ColIdx3 = GetColIdx(ResAttr);
4857  }
4858  else {
4859  if (Arg1Type == atInt && Arg2Type == atInt) {
4860  Table.AddIntCol(ResAttr);
4861  } else {
4862  Table.AddFltCol(ResAttr);
4863  }
4864  ColIdx3 = Table.GetColIdx(ResAttr);
4865  }
4866  }
4868  /*
4869  #ifdef USE_OPENMP
4870  if(GetMP()){
4871  ColGenericOpMP(Table, AddToFirstTable, ColIdx1, ColIdx2, Arg1Type, Arg2Type, ColIdx3, op);
4872  return;
4873  }
4874  #endif //USE_OPENMP
4875  */
4877  TRowIterator RI1, RI2;
4878  RI1 = BegRI();
4879  RI2 = Table.BegRI();
4880  TAttrType ResType = atFlt;
4881  if(Arg1Type == atInt && Arg2Type == atInt){ ResType = atInt;}
4882  while (RI1 < EndRI() && RI2 < Table.EndRI()) {
4883  if (ResType == atInt) {
4884  TInt V1 = RI1.GetIntAttr(ColIdx1);
4885  TInt V2 = RI2.GetIntAttr(ColIdx2);
4886  if (AddToFirstTable) {
4887  if (op == aoAdd) { IntCols[ColIdx3][RI1.GetRowIdx()] = V1 + V2; }
4888  if (op == aoSub) { IntCols[ColIdx3][RI1.GetRowIdx()] = V1 - V2; }
4889  if (op == aoMul) { IntCols[ColIdx3][RI1.GetRowIdx()] = V1 * V2; }
4890  if (op == aoDiv) { IntCols[ColIdx3][RI1.GetRowIdx()] = V1 / V2; }
4891  if (op == aoMod) { IntCols[ColIdx3][RI1.GetRowIdx()] = V1 % V2; }
4892  }
4893  else {
4894  if (op == aoAdd) { Table.IntCols[ColIdx3][RI2.GetRowIdx()] = V1 + V2; }
4895  if (op == aoSub) { Table.IntCols[ColIdx3][RI2.GetRowIdx()] = V1 - V2; }
4896  if (op == aoMul) { Table.IntCols[ColIdx3][RI2.GetRowIdx()] = V1 * V2; }
4897  if (op == aoDiv) { Table.IntCols[ColIdx3][RI2.GetRowIdx()] = V1 / V2; }
4898  if (op == aoMod) { Table.IntCols[ColIdx3][RI2.GetRowIdx()] = V1 % V2; }
4899  }
4900  } else {
4901  TFlt V1 = (Arg1Type == atInt) ? (TFlt)RI1.GetIntAttr(ColIdx1) : RI2.GetFltAttr(ColIdx1);
4902  TFlt V2 = (Arg2Type == atInt) ? (TFlt)RI1.GetIntAttr(ColIdx2) : RI2.GetFltAttr(ColIdx2);
4903  if (AddToFirstTable) {
4904  if (op == aoAdd) { FltCols[ColIdx3][RI1.GetRowIdx()] = V1 + V2; }
4905  if (op == aoSub) { FltCols[ColIdx3][RI1.GetRowIdx()] = V1 - V2; }
4906  if (op == aoMul) { FltCols[ColIdx3][RI1.GetRowIdx()] = V1 * V2; }
4907  if (op == aoDiv) { FltCols[ColIdx3][RI1.GetRowIdx()] = V1 / V2; }
4908  if (op == aoMod) { TExcept::Throw("Cannot find modulo for float columns"); }
4909  } else {
4910  if (op == aoAdd) { Table.FltCols[ColIdx3][RI2.GetRowIdx()] = V1 + V2; }
4911  if (op == aoSub) { Table.FltCols[ColIdx3][RI2.GetRowIdx()] = V1 - V2; }
4912  if (op == aoMul) { Table.FltCols[ColIdx3][RI2.GetRowIdx()] = V1 * V2; }
4913  if (op == aoDiv) { Table.FltCols[ColIdx3][RI2.GetRowIdx()] = V1 / V2; }
4914  if (op == aoMod) { TExcept::Throw("Cannot find modulo for float columns"); }
4915  }
4916  }
4917  RI1++;
4918  RI2++;
4919  }
4921  if (RI1 != EndRI() || RI2 != Table.EndRI()) {
4922  TExcept::Throw("ColGenericOp: Iteration error");
4923  }
4924 }
// Column-wise addition across two tables: forwards to the two-table
// ColGenericOp overload with aoAdd. Table, ResultAttrName and AddToFirstTable
// are passed through unchanged.
4926 void TTable::ColAdd(const TStr& Attr1, TTable& Table, const TStr& Attr2,
4927  const TStr& ResultAttrName, TBool AddToFirstTable) {
4928  ColGenericOp(Attr1, Table, Attr2, ResultAttrName, aoAdd, AddToFirstTable);
4929 }
// Column-wise subtraction across two tables: forwards to the two-table
// ColGenericOp overload with aoSub. Table, ResultAttrName and AddToFirstTable
// are passed through unchanged.
4931 void TTable::ColSub(const TStr& Attr1, TTable& Table, const TStr& Attr2,
4932  const TStr& ResultAttrName, TBool AddToFirstTable) {
4933  ColGenericOp(Attr1, Table, Attr2, ResultAttrName, aoSub, AddToFirstTable);
4934 }
// Column-wise multiplication across two tables: forwards to the two-table
// ColGenericOp overload with aoMul. Table, ResultAttrName and AddToFirstTable
// are passed through unchanged.
4936 void TTable::ColMul(const TStr& Attr1, TTable& Table, const TStr& Attr2,
4937  const TStr& ResultAttrName, TBool AddToFirstTable) {
4938  ColGenericOp(Attr1, Table, Attr2, ResultAttrName, aoMul, AddToFirstTable);
4939 }
// Column-wise division across two tables: forwards to the two-table
// ColGenericOp overload with aoDiv. Table, ResultAttrName and AddToFirstTable
// are passed through unchanged.
4941 void TTable::ColDiv(const TStr& Attr1, TTable& Table, const TStr& Attr2,
4942  const TStr& ResultAttrName, TBool AddToFirstTable) {
4943  ColGenericOp(Attr1, Table, Attr2, ResultAttrName, aoDiv, AddToFirstTable);
4944 }
// Column-wise modulo across two tables: forwards to the two-table
// ColGenericOp overload with aoMod. Table, ResultAttrName and AddToFirstTable
// are passed through unchanged.
4946 void TTable::ColMod(const TStr& Attr1, TTable& Table, const TStr& Attr2,
4947  const TStr& ResultAttrName, TBool AddToFirstTable) {
4948  ColGenericOp(Attr1, Table, Attr2, ResultAttrName, aoMod, AddToFirstTable);
4949 }
4952 void TTable::ColGenericOp(const TStr& Attr1, const TFlt& Num, const TStr& ResAttr, TArithOp op, const TBool floatCast) {
4953  // check if attribute is valid
4954  if (!IsAttr(Attr1)) { TExcept::Throw("No attribute present: " + Attr1); }
4956  TPair<TAttrType, TInt> Info1 = GetColTypeMap(Attr1);
4957  TAttrType ArgType = Info1.Val1;
4958  if (ArgType == atStr) {
4959  TExcept::Throw("Only numeric columns supported in arithmetic operations.");
4960  }
4961  // source column index
4962  TInt ColIdx1 = Info1.Val2;
4963  // destination column index
4964  TInt ColIdx2 = ColIdx1;
4966  // Create empty result column with type that of first attribute
4967  TBool shouldCast = floatCast;
4968  if (ResAttr != "") {
4969  if ((ArgType == atInt) & !shouldCast) {
4970  AddIntCol(ResAttr);
4971  } else {
4972  AddFltCol(ResAttr);
4973  }
4974  ColIdx2 = GetColIdx(ResAttr);
4975  } else {
4976  // Cannot change type of existing attribute
4977  shouldCast = false;
4978  }
4980  #ifdef USE_OPENMP
4981  if(GetMP()){
4982  ColGenericOpMP(ColIdx1, ColIdx2, ArgType, Num, op, shouldCast);
4983  return;
4984  }
4985  #endif //USE_OPENMP
4987  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
4988  if ((ArgType == atInt) && !shouldCast) {
4989  TInt CurVal = RowI.GetIntAttr(ColIdx1);
4990  TInt Val = static_cast<int>(Num);
4991  if (op == aoAdd) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal + Val; }
4992  if (op == aoSub) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal - Val; }
4993  if (op == aoMul) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal * Val; }
4994  if (op == aoDiv) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal / Val; }
4995  if (op == aoMod) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal % Val; }
4996  }
4997  else {
4998  TFlt CurVal = (ArgType == atFlt) ? RowI.GetFltAttr(ColIdx1) : (TFlt) RowI.GetIntAttr(ColIdx1);
4999  if (op == aoAdd) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal + Num; }
5000  if (op == aoSub) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal - Num; }
5001  if (op == aoMul) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal * Num; }
5002  if (op == aoDiv) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal / Num; }
5003  if (op == aoMod) { TExcept::Throw("Cannot find modulo for float columns"); }
5004  }
5005  }
5006 }
5008 #ifdef USE_OPENMP
5009 void TTable::ColGenericOpMP(const TInt& ColIdx1, const TInt& ColIdx2, TAttrType ArgType, const TFlt& Num, TArithOp op, TBool ShouldCast){
5010  TIntPrV Partitions;
5011  GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
5012  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
5013  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD)
5014  for (int i = 0; i < Partitions.Len(); i++){
5015  TRowIterator RowI(Partitions[i].GetVal1(), this);
5016  TRowIterator EndI(Partitions[i].GetVal2(), this);
5017  while(RowI < EndI){
5018  if ((ArgType == atInt) && !ShouldCast) {
5019  TInt CurVal = RowI.GetIntAttr(ColIdx1);
5020  TInt Val = static_cast<int>(Num);
5021  if (op == aoAdd) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal + Val; }
5022  if (op == aoSub) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal - Val; }
5023  if (op == aoMul) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal * Val; }
5024  if (op == aoDiv) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal / Val; }
5025  if (op == aoMod) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal % Val; }
5026  } else {
5027  TFlt CurVal = (ArgType == atFlt) ? RowI.GetFltAttr(ColIdx1) : (TFlt) RowI.GetIntAttr(ColIdx1);
5028  if (op == aoAdd) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal + Num; }
5029  if (op == aoSub) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal - Num; }
5030  if (op == aoMul) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal * Num; }
5031  if (op == aoDiv) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal / Num; }
5032  if (op == aoMod) { TExcept::Throw("Cannot find modulo for float columns"); }
5033  }
5034  RowI++;
5035  }
5036  }
5037 }
5038 #endif
// Adds the scalar Num to column Attr1: forwards to the scalar ColGenericOp
// overload with aoAdd. ResultAttrName and floatCast are passed through unchanged.
5040 void TTable::ColAdd(const TStr& Attr1, const TFlt& Num, const TStr& ResultAttrName, const TBool floatCast) {
5041  ColGenericOp(Attr1, Num, ResultAttrName, aoAdd, floatCast);
5042 }
// Subtracts the scalar Num from column Attr1: forwards to the scalar ColGenericOp
// overload with aoSub. ResultAttrName and floatCast are passed through unchanged.
5044 void TTable::ColSub(const TStr& Attr1, const TFlt& Num, const TStr& ResultAttrName, const TBool floatCast) {
5045  ColGenericOp(Attr1, Num, ResultAttrName, aoSub, floatCast);
5046 }
// Multiplies column Attr1 by the scalar Num: forwards to the scalar ColGenericOp
// overload with aoMul. ResultAttrName and floatCast are passed through unchanged.
5048 void TTable::ColMul(const TStr& Attr1, const TFlt& Num, const TStr& ResultAttrName, const TBool floatCast) {
5049  ColGenericOp(Attr1, Num, ResultAttrName, aoMul, floatCast);
5050 }
// Divides column Attr1 by the scalar Num: forwards to the scalar ColGenericOp
// overload with aoDiv. ResultAttrName and floatCast are passed through unchanged.
5052 void TTable::ColDiv(const TStr& Attr1, const TFlt& Num, const TStr& ResultAttrName, const TBool floatCast) {
5053  ColGenericOp(Attr1, Num, ResultAttrName, aoDiv, floatCast);
5054 }
// Column Attr1 modulo the scalar Num: forwards to the scalar ColGenericOp
// overload with aoMod. ResultAttrName and floatCast are passed through unchanged.
5056 void TTable::ColMod(const TStr& Attr1, const TFlt& Num, const TStr& ResultAttrName, const TBool floatCast) {
5057  ColGenericOp(Attr1, Num, ResultAttrName, aoMod, floatCast);
5058 }
5060 void TTable::ColConcat(const TStr& Attr1, const TStr& Attr2, const TStr& Sep, const TStr& ResAttr) {
5061  // check if attributes are valid
5062  if (!IsAttr(Attr1)) TExcept::Throw("No attribute present: " + Attr1);
5063  if (!IsAttr(Attr2)) TExcept::Throw("No attribute present: " + Attr2);
5065  TPair<TAttrType, TInt> Info1 = GetColTypeMap(Attr1);
5066  TPair<TAttrType, TInt> Info2 = GetColTypeMap(Attr2);
5068  if (Info1.Val1 != atStr || Info2.Val1 != atStr) {
5069  TExcept::Throw("Only string columns supported in concat.");
5070  }
5072  // source column indices
5073  TInt ColIdx1 = Info1.Val2;
5074  TInt ColIdx2 = Info2.Val2;
5076  // destination column index
5077  TInt ColIdx3 = ColIdx1;
5079  // Create empty result column with type that of first attribute
5080  if (ResAttr != "") {
5081  AddStrCol(ResAttr);
5082  ColIdx3 = GetColIdx(ResAttr);
5083  }
5085  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5086  TStr CurVal1 = RowI.GetStrAttr(ColIdx1);
5087  TStr CurVal2 = RowI.GetStrAttr(ColIdx2);
5088  TStr NewVal = CurVal1 + Sep + CurVal2;
5089  TInt Key = TInt(Context->StringVals.AddKey(NewVal));
5090  StrColMaps[ColIdx3][RowI.GetRowIdx()] = Key;
5091  }
5092 }
5094 void TTable::ColConcat(const TStr& Attr1, TTable& Table, const TStr& Attr2, const TStr& Sep,
5095  const TStr& ResAttr, TBool AddToFirstTable) {
5096  // check if attributes are valid
5097  if (!IsAttr(Attr1)) { TExcept::Throw("No attribute present: " + Attr1); }
5098  if (!Table.IsAttr(Attr2)) { TExcept::Throw("No attribute present: " + Attr2); }
5100  if (NumValidRows != Table.NumValidRows) {
5101  TExcept::Throw("Tables do not have equal number of rows");
5102  }
5104  TPair<TAttrType, TInt> Info1 = GetColTypeMap(Attr1);
5105  TPair<TAttrType, TInt> Info2 = Table.GetColTypeMap(Attr2);
5107  if (Info1.Val1 != atStr || Info2.Val1 != atStr) {
5108  TExcept::Throw("Only string columns supported in concat.");
5109  }
5111  // source column indices
5112  TInt ColIdx1 = Info1.Val2;
5113  TInt ColIdx2 = Info2.Val2;
5115  // destination column index
5116  TInt ColIdx3 = ColIdx1;
5118  if (!AddToFirstTable) {
5119  ColIdx3 = ColIdx2;
5120  }
5122  // Create empty result column in appropriate table with type that of first attribute
5123  if (ResAttr != "") {
5124  if (AddToFirstTable) {
5125  AddStrCol(ResAttr);
5126  ColIdx3 = GetColIdx(ResAttr);
5127  }
5128  else {
5129  Table.AddStrCol(ResAttr);
5130  ColIdx3 = Table.GetColIdx(ResAttr);
5131  }
5132  }
5134  TRowIterator RI1, RI2;
5136  RI1 = BegRI();
5137  RI2 = Table.BegRI();
5139  while (RI1 < EndRI() && RI2 < Table.EndRI()) {
5140  TStr CurVal1 = RI1.GetStrAttr(ColIdx1);
5141  TStr CurVal2 = RI2.GetStrAttr(ColIdx2);
5142  TStr NewVal = CurVal1 + Sep + CurVal2;
5143  TInt Key = TInt(Context->StringVals.AddKey(NewVal));
5144  if (AddToFirstTable) {
5145  StrColMaps[ColIdx3][RI1.GetRowIdx()] = Key;
5146  }
5147  else {
5148  Table.StrColMaps[ColIdx3][RI2.GetRowIdx()] = Key;
5149  }
5150  RI1++;
5151  RI2++;
5152  }
5154  if (RI1 != EndRI() || RI2 != Table.EndRI()) {
5155  TExcept::Throw("ColGenericOp: Iteration error");
5156  }
5157 }
5159 void TTable::ColConcatConst(const TStr& Attr1, const TStr& Val, const TStr& Sep, const TStr& ResAttr) {
5160  // check if attribute is valid
5161  if (!IsAttr(Attr1)) { TExcept::Throw("No attribute present: " + Attr1); }
5163  TPair<TAttrType, TInt> Info1 = GetColTypeMap(Attr1);
5165  if (Info1.Val1 != atStr) {
5166  TExcept::Throw("Only string columns supported in concat.");
5167  }
5169  // source column index
5170  TInt ColIdx1 = Info1.Val2;
5172  // destination column index
5173  TInt ColIdx2 = ColIdx1;
5175  // Create empty result column with type that of first attribute
5176  if (ResAttr != "") {
5177  AddStrCol(ResAttr);
5178  ColIdx2 = GetColIdx(ResAttr);
5179  }
5181  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5182  TStr CurVal = RowI.GetStrAttr(ColIdx1);
5183  TStr NewVal = CurVal + Sep + Val;
5184  TInt Key = TInt(Context->StringVals.AddKey(NewVal));
5185  StrColMaps[ColIdx2][RowI.GetRowIdx()] = Key;
5186  }
5187 }
5189 void TTable::ReadIntCol(const TStr& ColName, TIntV& Result) const{
5190  if (!IsColName(ColName)) { TExcept::Throw("no such column " + ColName); }
5191  if (GetColType(ColName) != atInt) { TExcept::Throw("not an integer column " + ColName); }
5192  TInt ColId = GetColIdx(ColName);
5193  for (TRowIterator it = BegRI(); it < EndRI(); it++) {
5194  Result.Add(it.GetIntAttr(ColId));
5195  }
5196 }
5198 void TTable::ReadFltCol(const TStr& ColName, TFltV& Result) const{
5199  if (!IsColName(ColName)) { TExcept::Throw("no such column " + ColName); }
5200  if (GetColType(ColName) != atFlt) { TExcept::Throw("not a floating point column " + ColName); }
5201  TInt ColId = GetColIdx(ColName);
5202  for (TRowIterator it = BegRI(); it < EndRI(); it++) {
5203  Result.Add(it.GetFltAttr(ColId));
5204  }
5205 }
5207 void TTable::ReadStrCol(const TStr& ColName, TStrV& Result) const{
5208  if (!IsColName(ColName)) { TExcept::Throw("no such column " + ColName); }
5209  if (GetColType(ColName) != atStr) { TExcept::Throw("not a string column " + ColName); }
5210  TInt ColId = GetColIdx(ColName);
5211  for (TRowIterator it = BegRI(); it < EndRI(); it++) {
5212  Result.Add(it.GetStrAttr(ColId));
5213  }
5214 }
5216 void TTable::ProjectInPlace(const TStrV& ProjectCols) {
5217  TStrV NProjectCols = NormalizeColNameV(ProjectCols);
5218  for (TInt c = 0; c < NProjectCols.Len(); c++) {
5219  if (!IsColName(NProjectCols[c])) { TExcept::Throw("no such column " + NProjectCols[c]); }
5220  }
5221  THashSet<TStr> ProjectColsSet = THashSet<TStr>(NProjectCols);
5222  // Delete the column vectors
5223  for (TInt i = Sch.Len() - 1; i >= 0; i--) {
5224  TStr ColName = GetSchemaColName(i);
5225  if (ProjectColsSet.IsKey(ColName) || ColName == IdColName) { continue; }
5226  TAttrType ColType = GetSchemaColType(i);
5227  TInt ColId = GetColIdx(ColName);
5228  switch (ColType) {
5229  case atInt:
5230  IntCols.Del(ColId);
5231  break;
5232  case atFlt:
5233  FltCols.Del(ColId);
5234  break;
5235  case atStr:
5236  StrColMaps.Del(ColId);
5237  break;
5238  }
5239  }
5241  // Rebuild the ColTypeMap with new indexes of the column vectors
5242  TInt IntColCnt = 0;
5243  TInt FltColCnt = 0;
5244  TInt StrColCnt = 0;
5245  ColTypeMap.Clr();
5246  for (TInt i = 0; i < Sch.Len(); i++) {
5247  TStr ColName = GetSchemaColName(i);
5248  if (!ProjectColsSet.IsKey(ColName) && ColName != IdColName) { continue; }
5249  TAttrType ColType = GetSchemaColType(i);
5250  switch (ColType) {
5251  case atInt:
5252  AddColType(ColName, atInt, IntColCnt);
5253  IntColCnt++;
5254  break;
5255  case atFlt:
5256  AddColType(ColName, atFlt, FltColCnt);
5257  FltColCnt++;
5258  break;
5259  case atStr:
5260  AddColType(ColName, atStr, StrColCnt);
5261  StrColCnt++;
5262  break;
5263  }
5264  }
5266  // Update schema
5267  for (TInt i = Sch.Len() - 1; i >= 0; i--) {
5268  TStr ColName = GetSchemaColName(i);
5269  if (ProjectColsSet.IsKey(ColName) || ColName == IdColName) { continue; }
5270  Sch.Del(i);
5271  }
5272 }
5274 TInt TTable::CompareKeyVal(const TInt& K1, const TInt& V1, const TInt& K2, const TInt& V2) {
5275  // if (K1 == K2) {
5276  // if (V1 < V2) { return -1; }
5277  // else if (V1 > V2) { return 1; }
5278  // else return 0;
5279  // }
5280  // if (K1 < K2) { return -1; }
5281  // else { return 1; }
5283  if (K1 == K2) { return V1 - V2; }
5284  else { return K1 - K2; }
5285 }
// NOTE(review): the function header for this body is missing from the listing;
// presumably this is CheckSortedKeyVal(Key, Val, Start, End) as called from
// QSortKeyVal — confirm against the original source.
// Scans adjacent (Key, Val) pairs in positions Start..End and stops at the
// first pair that CompareKeyVal orders after its successor.
// Returns 0 when no such pair exists (the range is in order), 1 otherwise.
5288  TInt j;
5289  for (j = Start; j < End; j++) {
5290  if (CompareKeyVal(Key[j], Val[j], Key[j+1], Val[j+1]) > 0) {
5291  break;
5292  }
5293  }
// j reaches End only if the loop was never broken out of
5294  if (j >= End) { return 0; }
5295  else { return 1; }
5296 }
5298 void TTable::ISortKeyVal(TIntV& Key, TIntV& Val, TInt Start, TInt End) {
5299  if (Start < End) {
5300  for (TInt i = Start+1; i <= End; i++) {
5301  TInt K = Key[i];
5302  TInt V = Val[i];
5303  TInt j = i;
5304  while ((Start < j) && (CompareKeyVal(Key[j-1], Val[j-1], K, V) > 0)) {
5305  Key[j] = Key[j-1];
5306  Val[j] = Val[j-1];
5307  j--;
5308  }
5309  Key[j] = K;
5310  Val[j] = V;
5311  }
5312  }
5313 }
5315 TInt TTable::GetPivotKeyVal(TIntV& Key, TIntV& Val, TInt Start, TInt End) {
5316  TInt L = End - Start + 1;
5317  const TInt Idx1 = Start + TInt::GetRnd(L);
5318  const TInt Idx2 = Start + TInt::GetRnd(L);
5319  const TInt Idx3 = Start + TInt::GetRnd(L);
5320  if (CompareKeyVal(Key[Idx1], Val[Idx1], Key[Idx2], Val[Idx2]) < 0) {
5321  if (CompareKeyVal(Key[Idx2], Val[Idx2], Key[Idx3], Val[Idx3]) < 0) { return Idx2; }
5322  if (CompareKeyVal(Key[Idx1], Val[Idx1], Key[Idx3], Val[Idx3]) < 0) { return Idx3; }
5323  return Idx1;
5324  } else {
5325  if (CompareKeyVal(Key[Idx3], Val[Idx3], Key[Idx2], Val[Idx2]) < 0) { return Idx2; }
5326  if (CompareKeyVal(Key[Idx3], Val[Idx3], Key[Idx1], Val[Idx1]) < 0) { return Idx3; }
5327  return Idx1;
5328  }
5329 }
// NOTE(review): the function header for this body is missing from the listing;
// presumably this is PartitionKeyVal(Key, Val, Start, End) as called from
// QSortKeyVal — confirm against the original source.
// Partitions Key/Val over Start..End around a pivot chosen by GetPivotKeyVal:
// afterwards every pair left of the returned position compares <= the pivot
// pair under CompareKeyVal, and the pivot pair sits at the returned position.
5333  TInt Pivot = GetPivotKeyVal(Key, Val, Start, End);
5334  //printf("Pivot=%d\n", Pivot.Val);
5335  TInt PivotKey = Key[Pivot];
5336  TInt PivotVal = Val[Pivot];
// park the pivot pair at End while the rest of the range is partitioned
5337  Key.Swap(Pivot, End);
5338  Val.Swap(Pivot, End);
// StoreIdx is the position where the next pair that is <= pivot will be placed
5339  TInt StoreIdx = Start;
5340  for (TInt i = Start; i < End; i++) {
5341  //printf("%d %d %d %d\n", Key[i].Val, Val[i].Val, PivotKey.Val, PivotVal.Val);
5342  if (CompareKeyVal(Key[i], Val[i], PivotKey, PivotVal) <= 0) {
5343  Key.Swap(i, StoreIdx);
5344  Val.Swap(i, StoreIdx);
5345  StoreIdx++;
5346  }
5347  }
5348  //printf("StoreIdx=%d\n", StoreIdx.Val);
5349  // move pivot value to its place
5350  Key.Swap(StoreIdx, End);
5351  Val.Swap(StoreIdx, End);
5352  return StoreIdx;
5353 }
// Quicksort of the parallel vectors Key and Val over positions Start..End
// (End is inclusive: the same bounds are handed to ISortKeyVal, whose loop
// runs up to and including End). Ordering is the one defined by the helpers
// it calls (CheckSortedKeyVal, ISortKeyVal, PartitionKeyVal).
//
// Strategy by range length L = End - Start:
//   L <= 0        nothing to do
//   L <= 20       ISortKeyVal
//   L <= 500000   partition, then two plain recursive calls
//   otherwise     partition, then each half is an OpenMP task (only when
//                 USE_OPENMP is defined and GLib_WIN32 is not; otherwise the
//                 two braced calls run sequentially)
// NOTE(review): no taskwait is issued here, so when tasks are spawned the
// halves may still be running when this call returns — presumably the caller
// synchronizes; confirm.
5355 void TTable::QSortKeyVal(TIntV& Key, TIntV& Val, TInt Start, TInt End) {
5356  //printf("Thread=%d, Start=%d, End=%d\n", omp_get_thread_num(), Start.Val, End.Val);
5357  TInt L = End-Start;
5358  if (L <= 0) { return; }
// a result of 0 presumably means the range is already in order — skip the work
5359  if (CheckSortedKeyVal(Key, Val, Start, End) == 0) { return; }
5361  if (L <= 20) { ISortKeyVal(Key, Val, Start, End); }
5362  else {
5363  TInt Pivot = PartitionKeyVal(Key, Val, Start, End);
// defensive: a pivot position beyond End would make the sub-ranges invalid
5365  if (Pivot > End) { return; }
5366  if (L <= 500000) {
5367  QSortKeyVal(Key, Val, Start, Pivot-1);
5368  QSortKeyVal(Key, Val, Pivot+1, End);
5369  } else {
5370 #ifdef USE_OPENMP
5371 #ifndef GLib_WIN32
5372  #pragma omp task untied shared(Key, Val)
5373 #endif
5374 #endif
5375  { QSortKeyVal(Key, Val, Start, Pivot-1); }
5377 #ifdef USE_OPENMP
5378 #ifndef GLib_WIN32
5379  #pragma omp task untied shared(Key, Val)
5380 #endif
5381 #endif
5382  { QSortKeyVal(Key, Val, Pivot+1, End); }
5383  }
5384  }
5385 }
5387 TIntV TTable::GetIntRowIdxByVal(const TStr& ColName, const TInt& Val) const {
5389  if (IntColIndexes.IsKey(ColName)) {
5390  THash<TInt, TIntV> ColIndex = IntColIndexes.GetDat(ColName);
5391  if (ColIndex.IsKey(Val)) {
5392  return ColIndex.GetDat(Val);
5393  }
5394  else {
5395  TIntV Empty;
5396  return Empty;
5397  }
5398  }
5399  TIntV ToReturn;
5400  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5401  TInt ValAtRow = RowI.GetIntAttr(ColName);
5402  if ( Val == ValAtRow) {
5403  ToReturn.Add(RowI.GetRowIdx());
5404  }
5405  }
5406  return ToReturn;
5407 }
5408 TIntV TTable::GetStrRowIdxByMap(const TStr& ColName, const TInt& Map) const {
5410  if (StrMapColIndexes.IsKey(ColName)) {
5411  THash<TInt, TIntV> ColIndex = StrMapColIndexes.GetDat(ColName);
5412  if (ColIndex.IsKey(Map)) {
5413  return ColIndex.GetDat(Map);
5414  }
5415  else {
5416  TIntV Empty;
5417  return Empty;
5418  }
5419  }
5420  TIntV ToReturn;
5421  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5422  TInt MapAtRow = RowI.GetStrMapByName(ColName);
5423  if ( Map == MapAtRow) {
5424  ToReturn.Add(RowI.GetRowIdx());
5425  }
5426  }
5427  return ToReturn;
5428 }
5430 TIntV TTable::GetFltRowIdxByVal(const TStr& ColName, const TFlt& Val) const {
5432  if (FltColIndexes.IsKey(ColName)) {
5433  THash<TFlt, TIntV> ColIndex = FltColIndexes.GetDat(ColName);
5434  if (ColIndex.IsKey(Val)) {
5435  return ColIndex.GetDat(Val);
5436  }
5437  else {
5438  TIntV Empty;
5439  return Empty;
5440  }
5441  }
5443  TIntV ToReturn;
5444  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5445  TFlt ValAtRow = RowI.GetFltAttr(ColName);
5446  if ( Val == ValAtRow) {
5447  ToReturn.Add(RowI.GetRowIdx());
5448  }
5449  }
5450  return ToReturn;
5451 }
5455  THash<TInt, TIntV> NewIndex;
5456  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5457  TInt ValAtRow = RowI.GetIntAttr(ColName);
5458  TInt RowIdx = RowI.GetRowIdx();
5459  if (NewIndex.IsKey(ValAtRow)) {
5460  TIntV Curr_V = NewIndex.GetDat(ValAtRow);
5461  Curr_V.Add(RowIdx);
5462  }
5463  else {
5464  TIntV New_V;
5465  New_V.Add(RowIdx);
5466  NewIndex.AddDat(ValAtRow, New_V);
5467  }
5468  }
5469  IntColIndexes.AddDat(ColName, NewIndex);
5470  return 0;
5471 }
5474  THash<TFlt, TIntV> NewIndex;
5475  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5476  TFlt ValAtRow = RowI.GetFltAttr(ColName);
5477  TInt RowIdx = RowI.GetRowIdx();
5478  if (NewIndex.IsKey(ValAtRow)) {
5479  TIntV Curr_V = NewIndex.GetDat(ValAtRow);
5480  Curr_V.Add(RowIdx);
5481  }
5482  else {
5483  TIntV New_V;
5484  New_V.Add(RowIdx);
5485  NewIndex.AddDat(ValAtRow, New_V);
5486  }
5487  }
5488  FltColIndexes.AddDat(ColName, NewIndex);
5489  return 0;
5490 }
5492  THash<TInt, TIntV> NewIndex;
5493  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5494  TInt MapAtRow = RowI.GetStrMapByName(ColName);
5495  TInt RowIdx = RowI.GetRowIdx();
5496  if (NewIndex.IsKey(MapAtRow)) {
5497  TIntV Curr_V = NewIndex.GetDat(MapAtRow);
5498  Curr_V.Add(RowIdx);
5499  }
5500  else {
5501  TIntV New_V;
5502  New_V.Add(RowIdx);
5503  NewIndex.AddDat(MapAtRow, New_V);
5504  }
5505  }
5506  StrMapColIndexes.AddDat(ColName, NewIndex);
5507  return 0;
5508 }
Definition: table.h:268
TSize GetMemUsedKB()
Returns approximate memory used by table in [KB].
Definition: table.cpp:3918
void ThresholdJoinInputCorrectness(const TStr &KeyCol1, const TStr &JoinCol1, const TTable &Table, const TStr &KeyCol2, const TStr &JoinCol2)
Definition: table.cpp:2458
void AddSchemaCol(const TStr &ColName, TAttrType ColType)
Adds column with name ColName and type ColType to the schema.
Definition: table.h:652
TFlt GetFltAttr(TInt ColIdx) const
Returns value of floating point attribute specified by float column index for current row...
Definition: table.cpp:159
TPair< TInt, TInt > TIntPr
Definition: ds.h:83
TInt RequestIndexInt(const TStr &ColName)
Creates Index for Int Column ColName.
Definition: table.cpp:5453
Definition: table.h:268
TBool IsLastGraphOfSequence()
Checks if the end of the graph sequence is reached.
Definition: table.cpp:3663
TBool IsAttr(const TStr &Attr)
Checks if Attr is an attribute of this table schema.
Definition: table.cpp:4605
void SetFltVal(TStr VarName, TFlt VarVal)
Set flt variable value in the predicate or all the children that use it.
Definition: table.h:100
void Order(const TStrV &OrderBy, TStr OrderColName="", TBool ResetRankByMSC=false, TBool Asc=true)
Orders the rows according to the values in columns of OrderBy (in descending lexicographic order)...
Definition: table.cpp:3220
void FillBucketsByInterval(TStr SplitAttr, TIntPrV SplitIntervals)
Fills RowIdBuckets with sets of row ids.
Definition: table.cpp:3577
bool Next()
Loads next line from the input file.
Definition: ssmp.cpp:17
TIter EndI() const
Returns an iterator referring to the past-the-end element in the vector.
Definition: ds.h:567
void RemoveRow(TInt RowIdx, TInt PrevRowIdx)
Removes row with id RowIdx.
Definition: table.cpp:1115
int Reserved() const
Definition: hash.h:771
Definition: table.h:268
TStrV EdgeAttrV
List of columns (attributes) to serve as edge attributes.
Definition: table.h:601
TStr GetStr() const
Definition: dt.h:1107
THash< GroupStmt, THash< TGroupKey, TIntV > > GroupMapping
Maps grouping statements to their (group-by key –> group id) mapping.
Definition: table.h:591
TInt FirstValidRow
Physical index of first valid row.
Definition: table.h:563
TStr DenormalizeColName(const TStr &ColName) const
Removes the suffix from the column name if it exists.
Definition: table.cpp:4625
int Len() const
Definition: dt.h:487
void GetDatV(TVec< TDat > &DatV) const
Definition: hash.h:450
TInt GetPivot(TIntV &V, TInt StartIdx, TInt EndIdx, const TVec< TAttrType > &SortByTypes, const TIntV &SortByIndices, TBool Asc)
Gets pivot element for QSort.
Definition: table.cpp:3090
TInt GetColIdx(const TStr &ColName) const
Gets index of column ColName among columns of the same type in the schema.
Definition: table.h:1004
enum TAttrType_ TAttrType
Types for tables, sparse and dense attributes.
TVec< uint64 > GetStartPosV(uint64 Lb, uint64 Ub) const
Finds start positions of all lines ending somewhere in [Lb, Ub)
Definition: ssmp.cpp:106
void StoreGroupCol(const TStr &GroupColName, const TVec< TPair< TInt, TInt > > &GroupAndRowIds)
Parallel helper function for grouping. - we currently don't support such parallel grouping by complex...
Definition: table.cpp:1290
static const TInt Last
Special value for Next vector entry - last row in table.
Definition: table.h:497
PTable UnionAll(const TTable &Table)
Returns union of this table with given Table, preserving duplicates.
Definition: table.cpp:4488
::TSize GetMemUsed() const
Definition: hash.h:794
static TInt PartitionKeyVal(TIntV &Key, TIntV &Val, TInt Start, TInt End)
Definition: table.cpp:5332
Primitive class: Wrapper around primitive data types.
Definition: table.h:220
bool operator==(const TRowIterator &RowI) const
Checks if this iterator points to the same row pointed by RowI.
Definition: table.cpp:147
TStrV GetSrcNodeIntAttrV() const
Gets src node int attribute name vector.
Definition: table.cpp:985
void PrintGrouping(const THash< TGroupKey, TIntV > &Grouping) const
Definition: table.cpp:1768
Schema Sch
Table Schema.
Definition: table.h:559
void SelectFirstNRows(const TInt &N)
Selects first N rows from the table.
Definition: table.cpp:3337
TStrV GetDstNodeStrAttrV() const
Gets dst node str attribute name vector.
Definition: table.cpp:1062
Definition: ds.h:129
void Del(const TSizeTy &ValN)
Removes the element at position ValN.
Definition: ds.h:1130
void GetPartitionRanges(TIntPrV &Partitions, TInt NumPartitions) const
Partitions the table into NumPartitions and populate Partitions with the ranges.
Definition: table.cpp:1157
TInt GetIntAttr(TInt ColIdx) const
Returns value of integer attribute specified by integer column index for current row.
Definition: table.cpp:155
Comparison operators for selection predicates.
Definition: table.h:7
int Val
Definition: dt.h:1046
void Defrag()
Releases memory of deleted rows, and defrags.
Definition: table.cpp:3291
PNEANet ToVarGraphSequenceIterator(TStr SplitAttr, TAttrAggr AggrPolicy, TIntPrV SplitIntervals)
Creates the graph sequence one at a time.
Definition: table.cpp:3649
void SaveBin(const TStr &OutFNm)
Saves table schema and content to a binary file.
Definition: table.cpp:829
TStr GetStrAttr(TInt ColIdx) const
Returns value of string attribute specified by string column index for current row.
Definition: table.cpp:163
void Save(TSOut &SOut) const
Definition: dt.h:1060
void AddIntCol(const TStr &ColName)
Adds an integer column with name ColName.
Definition: table.cpp:4650
THash< TStr, TPair< TAttrType, TInt > > ColTypeMap
Definition: table.h:574
TStr Rvar
Right variable of the comparison op.
Definition: table.h:21
static const int Mx
Definition: dt.h:1049
Definition: table.h:266
void ThresholdJoinCountCollisions(const TTable &TB, const TTable &TS, const TIntIntVH &T, TInt JoinColIdxB, TInt KeyColIdxB, TInt KeyColIdxS, THash< TIntPr, TIntTr > &Counters, TBool ThisIsSmaller, TAttrType JoinColType, TAttrType KeyType)
Definition: table.cpp:2486
void AddGraphAttributeV(TStrV &Attrs, TBool IsEdge, TBool IsSrc, TBool IsDst)
Adds vector of names of columns to be used as graph attributes.
Definition: table.cpp:972
void GroupByIntColMP(const TStr &GroupBy, THashMP< TInt, TIntV > &Grouping, TBool UsePhysicalIds=true) const
Groups/hashes by a single column with integer values, using OpenMP multi-threading.
Definition: table.cpp:1205
void SetFltColToConstMP(TInt UpdateColIdx, TFlt DefaultFltVal)
Definition: table.cpp:4129
int GetFlds() const
Returns the number of fields in the current line.
Definition: ssmp.h:51
const TVal1 & GetVal1() const
Definition: ds.h:60
void ThresholdJoinCountPerJoinKeyCollisions(const TTable &TB, const TTable &TS, const TIntIntVH &T, TInt JoinColIdxB, TInt KeyColIdxB, TInt KeyColIdxS, THash< TIntTr, TIntTr > &Counters, TBool ThisIsSmaller, TAttrType JoinColType, TAttrType KeyType)
Definition: table.cpp:2537
uint64 GetStreamPos() const
Returns position of stream pointer.
Definition: ssmp.h:89
TIter BegI() const
Definition: hash.h:171
void ColAdd(const TStr &Attr1, const TStr &Attr2, const TStr &ResultAttrName="")
Performs columnwise addition. See TTable::ColGenericOp.
Definition: table.cpp:4793
Possible column-wise arithmetic operations.
Definition: table.h:268
TInt RequestIndexStrMap(const TStr &ColName)
Creates Index for Str Column ColName.
Definition: table.cpp:5491
double Val
Definition: dt.h:1295
Definition: fl.h:319
TFlt GetNextFltAttr(TInt ColIdx) const
Returns value of float attribute specified by float column index for next row.
Definition: table.cpp:252
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
Definition: table.h:268
void AddSelectedRows(const TTable &Table, const TIntV &RowIDs)
Adds rows from Table that correspond to ids in RowIDs.
Definition: table.cpp:4376
int Len() const
Definition: hash.h:770
TStr IdColName
A mapping from column name to column type and column index among columns of the same type...
Definition: table.h:575
Predicate - encapsulates comparison operations.
Definition: table.h:82
TBool CompareAtomicConstTStr(TInt ColIdx, const TStr &Val, TPredComp Cmp)
Compares value in column ColIdx with given TStr Val.
Definition: table.cpp:208
PTable SelfSimJoinPerGroup(const TStr &GroupAttr, const TStr &SimCol, const TStr &DistanceColName, const TSimType &SimType, const TFlt &Threshold)
Performs join if the distance between two rows is less than the specified threshold.
Definition: table.cpp:2074
static TStrV NormalizeColNameV(const TStrV &Cols)
Adds the suffix to each column name if it is not already present.
Definition: table.h:549
static TInt CompareKeyVal(const TInt &K1, const TInt &V1, const TInt &K2, const TInt &V2)
Definition: table.cpp:5274
THash< TStr, THash< TInt, TIntV > > StrMapColIndexes
Indexes for String Columns.
Definition: table.h:579
THash< TStr, THash< TInt, TIntV > > IntColIndexes
Indexes for Int Columns.
Definition: table.h:578
void ColConcat(const TStr &Attr1, const TStr &Attr2, const TStr &Sep="", const TStr &ResAttr="")
Concatenates two string columns.
Definition: table.cpp:5060
void Save(TSOut &SOut) const
Definition: hash.h:141
TStrV GetSrcNodeStrAttrV() const
Gets src node str attribute name vector.
Definition: table.cpp:1051
TTableContext * Context
Execution Context.
Definition: table.h:555
Distance metrics for similarity joins.
Definition: table.h:149
TBool Start
A flag indicating whether the current row is the first valid row of the table.
Definition: table.h:386
void QSort(TIntV &V, TInt StartIdx, TInt EndIdx, const TVec< TAttrType > &SortByTypes, const TIntV &SortByIndices, TBool Asc=true)
Performs QSort on given vector V.
Definition: table.cpp:3134
TAttrType Type
Type of the predicate variables.
Definition: table.h:17
TPredicateNode * Left
Left child of this node.
Definition: table.h:57
THash< TStr, TInt > IntVars
Int variables in the current predicate tree.
Definition: table.h:84
Definition: ss.h:72
void InvalidateAffectedGroupings(const TStr &Attr)
Definition: table.cpp:1561
void Dump(FILE *OutF=stdout) const
Prints table contents to a text file.
Definition: table.cpp:867
TInt LastValidRow
Physical index of last valid row.
Definition: table.h:564
void Group(const TStrV &GroupBy, const TStr &GroupColName, TBool Ordered=true, TBool UsePhysicalIds=true)
Groups rows depending on values of GroupBy columns.
Definition: table.cpp:1549
TStr GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:811
void ResizeTable(int RowCount)
Resizes the table to hold RowCount rows.
Definition: table.cpp:4307
void PrintContextSize()
Definition: table.cpp:3937
static TInt GetMP()
Definition: table.h:537
Possible policies for aggregating node attributes.
Definition: table.h:266
void ColDiv(const TStr &Attr1, const TStr &Attr2, const TStr &ResultAttrName="")
Performs columnwise division. See TTable::ColGenericOp.
Definition: table.cpp:4805
void Rename(const TStr &Column, const TStr &NewLabel)
Renames a column.
Definition: table.cpp:1085
void GroupAux(const TStrV &GroupBy, THash< TGroupKey, TPair< TInt, TIntV > > &Grouping, TBool Ordered, const TStr &GroupColName, TBool KeepUnique, TIntV &UniqueVec, TBool UsePhysicalIds=true)
Helper function for grouping.
Definition: table.cpp:1302
const TVal2 & GetVal2() const
Definition: ds.h:61
TVal1 Val1
Definition: ds.h:131
TStrV GetEdgeFltAttrV() const
Gets edge float attribute name vector.
Definition: table.cpp:1040
Definition: table.h:149
bool GetInt(const int &FldN, int &Val) const
If the field FldN is an integer its value is returned in Val and the function returns true...
Definition: ss.cpp:447
TStr GetNextStrAttr(TInt ColIdx) const
Returns value of string attribute specified by string column index for next row.
Definition: table.cpp:256
Execution context.
Definition: table.h:194
const TDat & GetDat(const TKey &Key) const
Definition: hash.h:220
Node iterator. Only forward iteration (operator++) is supported.
Definition: network.h:1632
void GetStrAttrNames(TStrV &Names) const
Gets vector of str attribute names.
Definition: network.h:1740
TIter EndI() const
Definition: hash.h:176
void Clr()
Definition: bd.h:502
Schema GetSchema()
Gets the schema of this table.
Definition: table.h:1116
TVec< TIntV > RowIdBuckets
Partitioning of row ids into buckets corresponding to different graph objects when generating a seque...
Definition: table.h:609
TRowIteratorWithRemove BegRIWR()
Gets iterator with remove to the first valid row.
Definition: table.h:1236
TInt GetNumValidRows() const
Gets number of valid, i.e. not deleted, rows in this table.
Definition: table.h:1225
TRowIterator BegRI() const
Gets iterator to the first valid row of the table.
Definition: table.h:1232
int GetFlds() const
Returns the number of fields in the current line.
Definition: ss.h:116
PNEANet ToGraphPerGroupIterator(TStr GroupAttr, TAttrAggr AggrPolicy)
Creates the graph sequence one at a time.
Definition: table.cpp:3654
TVec< TIntV > IntCols
Next[i] is the successor of row i. Table iterators follow the order dictated by Next ...
Definition: table.h:568
Iterator class for TTable rows, that allows logical row removal while iterating.
Definition: table.h:383
TSizeTy GetMemUsed() const
Returns the memory footprint (the number of bytes) of the vector.
Definition: ds.h:483
void CheckAndAddIntNode(PNEANet Graph, THashSet< TInt > &NodeVals, TInt NodeId)
Checks if given NodeId is seen earlier; if not, add it to Graph and hashmap NodeVals.
Definition: table.cpp:3368
TVec< PNEANet > ToGraphSequence(TStr SplitAttr, TAttrAggr AggrPolicy, TInt WindowSize, TInt JumpSize, TInt StartVal=TInt::Mn, TInt EndVal=TInt::Mx)
Creates a sequence of graphs based on values of column SplitAttr and windows specified by JumpSize an...
Definition: table.cpp:3629
void GroupByFltCol(const TStr &GroupBy, T &Grouping, const TIntV &IndexSet, TBool All, TBool UsePhysicalIds=true) const
Groups/hashes by a single column with float values. Returns hash table with grouping.
Definition: table.h:1633
TInt GetStrMapByName(const TStr &Col) const
Returns integer mapping of string attribute specified by attribute name for current row...
Definition: table.cpp:181
PTable Minus(TTable &Table)
Returns table with rows that are present in this table but not in given Table.
Definition: table.cpp:4569
bool IsKey(const TKey &Key) const
Definition: shash.h:1148
static PTable GetNodeTable(const PNEANet &Network, TTableContext *Context)
Extracts node TTable from PNEANet.
Definition: table.cpp:3667
THash< TStr, TStr > StrVars
String variables in the current predicate tree.
Definition: table.h:86
TIntV GetStrRowIdxByMap(const TStr &ColName, const TInt &Map) const
Gets the rows containing int mapping Map in str column ColName.
Definition: table.cpp:5408
int GetId() const
Returns edge ID.
Definition: network.h:1722
TStr GetIdColName() const
Gets name of the id column of this table.
Definition: table.h:646
static TBool EvalStrAtom(const TStr &Val1, const TStr &Val2, TPredComp Cmp)
Compare atomic string values Val1 and Val2 using predicate Cmp.
Definition: table.h:123
Definition: gbase.h:23
Default constructor.
Definition: table.h:389
static void LoadSSSeq(PTable &NewTable, const Schema &S, const TStr &InFNm, const TIntV &RelevantCols, const char &Separator, TBool HasTitleLine)
Sequentially loads data from input file at InFNm into NewTable.
Definition: table.cpp:649
Definition: table.h:7
Definition: dt.h:1293
Definition: fl.h:58
void Save(TSOut &SOut) const
Definition: ds.h:903
void IncrementNext()
Increments the next vector and sets last, NumRows and NumValidRows.
Definition: table.cpp:2235
PTable SimJoin(const TStrV &Cols1, const TTable &Table, const TStrV &Cols2, const TStr &DistanceColName, const TSimType &SimType, const TFlt &Threshold)
Performs join if the distance between two rows is less than the specified threshold.
Definition: table.cpp:1974
bool Empty() const
Tests whether the vector is empty.
Definition: ds.h:542
void InitIds()
Adds explicit row ids, initializes hash set mapping ids to physical rows.
Definition: table.cpp:1863
TStrTrV CommonNodeAttrs
List of attribute pairs with values common to source and destination and their common given name...
Definition: table.h:604
void QSortPar(TIntV &V, const TVec< TAttrType > &SortByTypes, const TIntV &SortByIndices, TBool Asc=true)
Performs QSort in parallel on given vector V.
Definition: table.cpp:3186
void Save(TSOut &SOut)
Saves table schema and content to a binary format.
Definition: table.cpp:834
int GetDstNId() const
Returns the destination of the edge.
Definition: network.h:1726
void Swap(TVec< TVal, TSizeTy > &Vec)
Swaps the contents of the vector with Vec.
Definition: ds.h:1047
TBool Result
Result of evaluating the predicate rooted at this node.
Definition: table.h:54
void ReadFltCol(const TStr &ColName, TFltV &Result) const
Reads values of entire float column into Result.
Definition: table.cpp:5198
void InvalidatePhysicalGroupings()
Definition: table.cpp:1557
TIter EndI() const
Definition: hashmp.h:156
void SkipCommentLines()
Skips lines that begin with a comment character.
Definition: ssmp.cpp:12
TPair< TIntV, TFltV > TGroupKey
Represents grouping key with IntV for integer and string attributes and FltV for float attributes...
Definition: table.h:145
Iterator class for TTable rows.
Definition: table.h:339
TInt GetNextRowIdx() const
Gets physical index of next row.
Definition: table.cpp:243
TVal2 Val2
Definition: ds.h:132
int GetId() const
Returns ID of the current node.
Definition: network.h:1647
static const int Mn
Definition: dt.h:1048
bool Eof() const
Checks for end of file.
Definition: ss.h:122
void Aggregate(const TStrV &GroupByAttrs, TAttrAggr AggOp, const TStr &ValAttr, const TStr &ResAttr, TBool Ordered=true)
Aggregates values of ValAttr after grouping with respect to GroupByAttrs. Results are stored as new at...
Definition: table.cpp:1565
TAttrType GetSchemaColType(TInt Idx) const
Gets type of the column with index Idx in the schema.
Definition: table.h:650
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
void SetIntVal(TStr VarName, TInt VarVal)
Set int variable value in the predicate or all the children that use it.
Definition: table.h:98
TStrV GetEdgeIntAttrV() const
Gets edge int attribute name vector.
Definition: table.cpp:1007
Definition: table.h:149
void SetStrVal(TStr VarName, TStr VarVal)
Set str variable value in the predicate or all the children that use it.
Definition: table.h:102
void RemoveNext()
Removes next row.
Definition: table.cpp:278
TStr StrConst
Str const value if this object is a string constant.
Definition: table.h:24
TVec< PNEANet > ToGraphPerGroup(TStr GroupAttr, TAttrAggr AggrPolicy)
Creates a sequence of graphs based on grouping specified by GroupAttr.
Definition: table.cpp:3640
const TTable * Table
Reference to table containing this row.
Definition: table.h:341
static void Throw(const TStr &MsgStr)
Definition: ut.h:187
Schema DenormalizeSchema() const
Removes suffix from column names in the Schema.
Definition: table.cpp:4642
PNEANet NextGraphIterator()
Calls to this must be preceded by a call to one of the above ToGraph*Iterator functions.
Definition: table.cpp:3659
void PutAll(const TVal &Val)
Sets all elements of the vector to value Val.
Definition: ds.h:1166
unsigned long long uint64
Definition: bd.h:38
PNEANet BuildGraph(const TIntV &RowIds, TAttrAggr AggrPolicy)
Makes a single pass over the rows in the given row id set, and creates nodes, edges, assigns node and edge attributes.
Definition: table.cpp:3425
TBool EvalAtomicPredicate(const TAtomicPredicate &Atom)
Evaluate the given atomic predicate.
Definition: table.cpp:102
void ColSub(const TStr &Attr1, const TStr &Attr2, const TStr &ResultAttrName="")
Performs columnwise subtraction. See TTable::ColGenericOp.
Definition: table.cpp:4797
int GetSrcNId() const
Gets the source node of an edge.
Definition: graphmp.h:116
const TVal & GetDat(const TVal &Val) const
Returns reference to the first occurrence of element Val.
Definition: ds.h:807
int GetEmptyRowsStart(int NewRows)
Gets the start index to a chunk of empty rows of size NewRows.
Definition: table.cpp:4353
void PrintSize()
Definition: table.cpp:3908
THash< TStr, THash< TFlt, TIntV > > FltColIndexes
Indexes for Float Columns.
Definition: table.h:580
TStr Lvar
Left variable of the comparison op.
Definition: table.h:20
const char * GetKey(const int &KeyId) const
Definition: hash.h:821
void ProjectInPlace(const TStrV &ProjectCols)
Keeps only the columns specified in ProjectCols.
Definition: table.cpp:5216
TStr GetStr() const
Definition: table.h:237
TBool CompareAtomicConst(TInt ColIdx, const TPrimitive &Val, TPredComp Cmp)
Compares value in column ColIdx with given primitive Val.
Definition: table.cpp:190
size_t TSize
Definition: bd.h:58
#define Assert(Cond)
Definition: bd.h:251
void Reindex()
Reinitializes row ids.
Definition: table.cpp:1869
TInt CurrBucket
Current row id bucket - used when generating a sequence of graphs using an iterator.
Definition: table.h:610
PTable IsNextK(const TStr &OrderCol, TInt K, const TStr &GroupBy, const TStr &RankColName="")
Distance based filter.
Definition: table.cpp:3869
TAttrType GetColType(const TStr &ColName) const
Gets type of column ColName.
Definition: table.h:1218
TVec< TIntV > StrColMaps
Data columns of integer mappings of string attributes.
Definition: table.h:570
int sync_bool_compare_and_swap(int *lock)
Definition: table.cpp:4147
TRowIteratorWithRemove & Next()
Increments the iterator (For Python compatibility).
Definition: table.cpp:222
PNEANet ToGraphSequenceIterator(TStr SplitAttr, TAttrAggr AggrPolicy, TInt WindowSize, TInt JumpSize, TInt StartVal=TInt::Mn, TInt EndVal=TInt::Mx)
Creates the graph sequence one at a time.
Definition: table.cpp:3644
Definition: table.h:149
int GetDstNId() const
Gets destination node of an edge.
Definition: graphmp.h:118
int AddKey(const TKey &Key)
Definition: shash.h:1254
::TSize GetMemUsed() const
Definition: hash.h:159
void GroupByIntCol(const TStr &GroupBy, T &Grouping, const TIntV &IndexSet, TBool All, TBool UsePhysicalIds=true) const
Groups/hashes by a single column with integer values.
Definition: table.h:1605
PTable Join(const TStr &Col1, const TTable &Table, const TStr &Col2)
Performs equijoin.
Definition: table.cpp:2252
bool IsKey(const TKey &Key) const
Definition: hashmp.h:191
static void LoadSSPar(PTable &NewTable, const Schema &S, const TStr &InFNm, const TIntV &RelevantCols, const char &Separator, TBool HasTitleLine)
Loads data in parallel from input file at InFNm into NewTable. Only works when NewTable has no string c...
Definition: table.cpp:487
TIntV GetIntRowIdxByVal(const TStr &ColName, const TInt &Val) const
Gets the rows containing Val in int column ColName.
Definition: table.cpp:5387
TInt GetRowIdx() const
Gets the id of the row pointed by this iterator.
Definition: table.cpp:151
bool GetFlt(const int &FldN, double &Val) const
If the field FldN is a float its value is returned in Val and the function returns true...
Definition: ss.cpp:485
A class representing a cached grouping statement identifier.
Definition: table.h:275
TStr GetSchemaColName(TInt Idx) const
Gets name of the column with index Idx in the schema.
Definition: table.h:648
int GetSrcNId() const
Returns the source of the edge.
Definition: network.h:1724
TInt GetStrMapById(TInt ColIdx) const
Returns integer mapping of a string attribute value specified by string column index for current row...
Definition: table.cpp:186
TStrV SrcNodeAttrV
List of columns (attributes) to serve as source node attributes.
Definition: table.h:602
TAttrAggr AggrPolicy
Aggregation policy used for solving conflicts between different values of an attribute of the same no...
Definition: table.h:611
static void QSortKeyVal(TIntV &Key, TIntV &Val, TInt Start, TInt End)
Definition: table.cpp:5355
void Select(TPredicate &Predicate, TIntV &SelectedRows, TBool Remove=true)
Selects rows that satisfy given Predicate.
Definition: table.cpp:2730
void UnionAllInPlace(const TTable &Table)
Same as TTable::ConcatTable.
Definition: table.cpp:4501
TInt GetInt() const
Definition: table.h:235
char GetCh(const int &ChN) const
Definition: dt.h:483
TIntIntH RowIdMap
Mapping of permanent row ids to physical id.
Definition: table.h:576
void SaveSS(const TStr &OutFNm)
Saves table schema and content to a TSV file.
Definition: table.cpp:780
PTable Union(const TTable &Table)
Returns union of this table with given Table.
Definition: table.cpp:4508
void SelectAtomicConst(const TStr &Col, const TPrimitive &Val, TPredComp Cmp, TIntV &SelectedRows, PTable &SelectedTable, TBool Remove=true, TBool Table=true)
Selects rows where the value of Col matches given primitive Val.
Definition: table.cpp:2853
Definition: table.h:5
void UpdateFltFromTable(const TStr &KeyAttr, const TStr &UpdateAttr, const TTable &Table, const TStr &FKeyAttr, const TStr &ReadAttr, TFlt DefaultFltVal=0.0)
Definition: table.cpp:4219
Edge iterator. Only forward iteration (operator++) is supported.
Definition: graphmp.h:99
void ColConcatConst(const TStr &Attr1, const TStr &Val, const TStr &Sep="", const TStr &ResAttr="")
Concatenates column values with given string value.
Definition: table.cpp:5159
Definition: fl.h:128
void GetCollidingRows(const TTable &T, THashSet< TInt > &Collisions)
Gets set of row ids of rows common with table T.
Definition: table.cpp:3991
void AddGraphAttribute(const TStr &Attr, TBool IsEdge, TBool IsSrc, TBool IsDst)
Adds names of columns to be used as graph attributes.
Definition: table.cpp:965
void KeepSortedRows(const TIntV &KeepV)
Removes all rows that are not mentioned in the SORTED vector KeepV.
Definition: table.cpp:1132
TPair< TAttrType, TInt > GetColTypeMap(const TStr &ColName) const
Gets column type and index of ColName.
Definition: table.h:676
TAttrType GetType() const
Definition: table.h:238
THash< TInt, TInt > TIntH
Definition: hash.h:565
void GroupingSanityCheck(const TStr &GroupBy, const TAttrType &AttrType) const
Checks if grouping key exists and matches given attr type.
Definition: table.cpp:1195
void GetFltAttrNames(TStrV &Names) const
Gets vector of flt attribute names.
Definition: network.h:1744
TStrHash< TInt, TBigStrPool > StringVals
StringPool - stores string data values and maps them to integers.
Definition: table.h:196
void UpdateTableForNewRow()
Updates table state after adding one or more rows.
Definition: table.cpp:4117
void SetVal(const TSizeTy &ValN, const TVal &Val)
Sets the value of element at position ValN to Val.
Definition: ds.h:625
int AddKey(const char *Key)
Definition: hash.h:896
static TInt UseMP
Global switch for choosing multi-threaded versions of TTable functions.
Definition: table.h:500
virtual void Flush()=0
TPredComp Compare
Comparison op represented by this node.
Definition: table.h:19
void DelColType(const TStr &ColName)
Removes column with name ColName from the ColTypeMap.
Definition: table.h:671
Definition: dt.h:1044
void ReadIntCol(const TStr &ColName, TIntV &Result) const
Reads values of entire int column into Result.
Definition: table.cpp:5189
void FillBucketsByWindow(TStr SplitAttr, TInt JumpSize, TInt WindowSize, TInt StartVal, TInt EndVal)
Fills RowIdBuckets with sets of row ids.
Definition: table.cpp:3527
static TStr NormalizeColName(const TStr &ColName)
Adds suffix to column name if it doesn't exist.
Definition: table.h:540
void AddStrCol(const TStr &ColName)
Adds a string column with name ColName.
Definition: table.cpp:4664
THash< TStr, GroupStmt > GroupStmtNames
Maps user-given grouping statement names to their group-by attributes.
Definition: table.h:583
TRowIterator & Next()
Increments the iterator (For Python compatibility).
Definition: table.cpp:135
TStr SrcCol
Column (attribute) to serve as src nodes when constructing the graph.
Definition: table.h:599
void GetIntAttrNames(TStrV &Names) const
Gets vector of int attribute names.
Definition: network.h:1689
void ISort(const TSizeTy &MnLValN, const TSizeTy &MxRValN, const bool &Asc)
Insertion sorts the values between positions MnLValN...MxRValN.
Definition: ds.h:1184
PTable Project(const TStrV &ProjectCols)
Returns table with only the columns in ProjectCols.
Definition: table.cpp:4592
void StoreStrCol(const TStr &ColName, const TStrV &ColVals)
Adds entire str column to table.
Definition: table.cpp:4098
TPredicateNode * Right
Definition: table.h:58
TVec< TFltV > FltCols
Data columns of floating point attributes.
Definition: table.h:569
TVec< TStr > TStrV
Definition: ds.h:1534
TStrV GetDstNodeFltAttrV() const
Gets dst node float attribute name vector.
Definition: table.cpp:1029
TStrV DstNodeAttrV
List of columns (attributes) to serve as destination node attributes.
Definition: table.h:603
uint64 CountNewLinesInRange(uint64 Lb, uint64 Ub) const
Counts number of occurrences of '\n' in [Lb, Ub)
Definition: ssmp.cpp:102
Edge iterator. Only forward iteration (operator++) is supported.
Definition: network.h:1707
TIntV Next
A vector describing the logical order of the rows.
Definition: table.h:565
static int GetRnd(const int &Range=0)
Definition: dt.h:1085
Definition: ds.h:32
void Gen(const int &ExpectVals)
Definition: hashmp.h:160
int AddKey(const TKey &Key)
Definition: hash.h:331
TRowIterator EndRI() const
Gets iterator to the last valid row of the table.
Definition: table.h:1234
void AddStrVal(const TInt &ColIdx, const TStr &Val)
Adds Val in column with id ColIdx.
Definition: table.cpp:951
TTable * Table
Reference to table containing this row.
Definition: table.h:385
int GetIntFromFldV(TVec< char * > &FieldsV, const int &FldN)
Gets integer at field FldN.
Definition: ssmp.cpp:152
void AddRow(const TRowIterator &RI)
Adds row corresponding to RI.
Definition: table.cpp:4272
void NextFromIndex(uint64 Index, TVec< char * > &FieldsV)
Loads next line starting from a given position.
Definition: ssmp.cpp:128
TInt NumRows
Number of rows in the table (valid and invalid).
Definition: table.h:561
TFlt GetFltVal(const TStr &ColName, const TInt &RowIdx)
Gets the value of float attribute ColName at row RowIdx.
Definition: table.h:1015
static PTable LoadSS(const Schema &S, const TStr &InFNm, TTableContext *Context, const char &Separator= '\t', TBool HasTitleLine=false)
Loads table from spread sheet (TSV, CSV, etc). Note: HasTitleLine = true is not supported. Please comment title lines instead.
Definition: table.cpp:775
TVec< TFlt > TFltV
Definition: ds.h:1531
void Unique(const TStr &Col)
Removes rows with duplicate values in given column.
Definition: table.cpp:1246
TRowIteratorWithRemove & operator++(int)
Increments the iterator.
Definition: table.cpp:218
void AddJointRow(const TTable &T1, const TTable &T2, TInt RowIdx1, TInt RowIdx2)
Adds joint row T1[RowIdx1]<=>T2[RowIdx2].
Definition: table.cpp:1937
void Classify(TPredicate &Predicate, const TStr &LabelName, const TInt &PositiveLabel=1, const TInt &NegativeLabel=0)
Definition: table.cpp:2785
void Merge(TIntV &V, TInt Idx1, TInt Idx2, TInt Idx3, const TVec< TAttrType > &SortByTypes, const TIntV &SortByIndices, TBool Asc=true)
Helper function for parallel QSort.
Definition: table.cpp:3158
TStr DstCol
Column (attribute) to serve as dst nodes when constructing the graph.
Definition: table.h:600
TIter BegI() const
Definition: hashmp.h:153
void ReadStrCol(const TStr &ColName, TStrV &Result) const
Reads values of entire string column into Result.
Definition: table.cpp:5207
TStr GetStrVal(TInt ColIdx, TInt RowIdx) const
Gets the value in column with id ColIdx at row RowIdx.
Definition: table.h:636
long long int64
Definition: bd.h:27
void GetKeyV(TVec< TKey > &KeyV) const
Definition: hash.h:442
static PTable GetEdgeTable(const PNEANet &Network, TTableContext *Context)
Extracts edge TTable from PNEANet.
Definition: table.cpp:3719
static const TInt Invalid
Special value for Next vector entry - logically removed row.
Definition: table.h:498
void AddColType(const TStr &ColName, TPair< TAttrType, TInt > ColType)
Adds column with name ColName and type ColType to the ColTypeMap.
Definition: table.h:661
Definition: dt.h:412
PNEANet GetNextGraphFromSequence()
Returns the next graph in sequence corresponding to RowIdBuckets.
Definition: table.cpp:3612
bool Empty() const
Definition: dt.h:488
TBool CompareAtomicConst(TInt ColIdx, const TPrimitive &Val, TPredComp Cmp)
Compares value in column ColIdx with given primitive Val.
Definition: table.cpp:282
void StoreFltCol(const TStr &ColName, const TFltV &ColVals)
Adds entire flt column to table.
Definition: table.cpp:4081
THash< GroupStmt, THash< TInt, TGroupKey > > GroupIDMapping
Maps grouping statements to their (group id –> group-by key) mapping.
Definition: table.h:587
TInt IntConst
Int const value if this object is an integer constant.
Definition: table.h:22
TIter BegI() const
Returns an iterator pointing to the first element in the vector.
Definition: ds.h:565
TPredOp Op
Logical op represented by this node.
Definition: table.h:53
void GroupByStrCol(const TStr &GroupBy, T &Grouping, const TIntV &IndexSet, TBool All, TBool UsePhysicalIds=true) const
Groups/hashes by a single column with string values. Returns hash table with grouping.
Definition: table.h:1660
TTableContext * ChangeContext(TTableContext *Context)
Changes the current context. Moves all object items to the new context.
Definition: table.cpp:901
Definition: hash.h:88
TInt CurrRowIdx
Physical row index of current row pointed by iterator.
Definition: table.h:384
TPredicateNode * Root
Root node of the current predicate tree.
Definition: table.h:87
Definition: gbase.h:23
Definition: table.h:268
void AggregateCols(const TStrV &AggrAttrs, TAttrAggr AggOp, const TStr &ResAttr)
Aggregates attributes in AggrAttrs across columns.
Definition: table.cpp:1730
bool operator==(const TRowIteratorWithRemove &RowI) const
Checks if this iterator points to the same row pointed by RowI.
Definition: table.cpp:235
Table class: Relational table with columnar data storage.
Definition: table.h:495
bool operator<(const TRowIterator &RowI) const
Checks if this iterator points to a row that is before the one pointed by RowI.
Definition: table.cpp:141
void SetStreamPos(uint64 Pos)
Sets position of stream pointer.
Definition: ssmp.h:97
void UpdateFltFromTableMP(const TStr &KeyAttr, const TStr &UpdateAttr, const TTable &Table, const TStr &FKeyAttr, const TStr &ReadAttr, TFlt DefaultFltVal=0.0)
Definition: table.cpp:4151
static PTable GetEdgeTablePN(const PNGraphMP &Network, TTableContext *Context)
Extracts edge TTable from parallel graph PNGraphMP.
Definition: table.cpp:3777
void ISort(TIntV &V, TInt StartIdx, TInt EndIdx, const TVec< TAttrType > &SortByTypes, const TIntV &SortByIndices, TBool Asc=true)
Performs insertion sort on given vector V.
Definition: table.cpp:3076
TInt GetRowIdx() const
Gets physical index of current row.
Definition: table.cpp:239
TInt RequestIndexFlt(const TStr &ColName)
Creates Index for Flt Column ColName.
Definition: table.cpp:5472
static TBool EvalAtom(T Val1, T Val2, TPredComp Cmp)
Compare atomic values Val1 and Val2 using predicate Cmp.
Definition: table.h:110
bool operator<(const TRowIteratorWithRemove &RowI) const
Checks if this iterator points to a row that is before the one pointed by RowI.
Definition: table.cpp:229
void InitRowIdBuckets(int NumBuckets)
Initializes the RowIdBuckets vector which will be used for the graph sequence creation.
Definition: table.cpp:3515
TStrV GetSrcNodeFltAttrV() const
Gets src node float attribute name vector.
Definition: table.cpp:1018
static PTable GetFltNodePropertyTable(const PNEANet &Network, const TIntFltH &Property, const TStr &NodeAttrName, const TAttrType &NodeAttrType, const TStr &PropertyAttrName, TTableContext *Context)
Extracts node and edge property TTables from THash.
Definition: table.cpp:3830
Hash-Table with multiprocessing support.
Definition: hashmp.h:81
PTable ThresholdJoinPerJoinKeyOutputTable(const THash< TIntTr, TIntTr > &Counters, TInt Threshold, const TTable &Table)
Definition: table.cpp:2602
TVal1 Val1
Definition: ds.h:34
PTable ThresholdJoin(const TStr &KeyCol1, const TStr &JoinCol1, const TTable &Table, const TStr &KeyCol2, const TStr &JoinCol2, TInt Threshold, TBool PerJoinKey=false)
Definition: table.cpp:2624
static void ISortKeyVal(TIntV &Key, TIntV &Val, TInt Start, TInt End)
Definition: table.cpp:5298
TBool IsConst
Flag if this atomic node represents a constant value.
Definition: table.h:18
TInt CurrRowIdx
Physical row index of current row pointed by iterator.
Definition: table.h:340
TVal2 Val2
Definition: ds.h:35
TVec< TInt > TIntV
Definition: ds.h:1529
static TInt GetPivotKeyVal(TIntV &Key, TIntV &Val, TInt Start, TInt End)
Definition: table.cpp:5315
void Clr(const bool &DoDel=true, const int &NoDelLim=-1, const bool &ResetDat=true)
Definition: hash.h:319
Definition: table.h:7
bool Next()
Loads next line from the input file.
Definition: ss.cpp:412
Definition: bd.h:196
TInt IsNextDirty
Flag to signify whether the rows are stored in logical sequence or reordered. Used for optimizing Get...
Definition: table.h:613
TStrV GetEdgeStrAttrV() const
Gets edge str attribute name vector.
Definition: table.cpp:1074
Definition: table.h:5
void AddFltCol(const TStr &ColName)
Adds a float column with name ColName.
Definition: table.cpp:4657
TInt CompareRows(TInt R1, TInt R2, const TAttrType &CompareByType, const TInt &CompareByIndex, TBool Asc=true)
Returns positive value if R1 is bigger, negative value if R2 is bigger, and 0 if they are equal (strc...
Definition: table.cpp:3044
TStr RenumberColName(const TStr &ColName) const
Returns a re-numbered column name based on number of existing columns with conflicting names...
Definition: table.cpp:4609
TTriple< TInt, TInt, TInt > TIntTr
Definition: ds.h:170
TInt NumValidRows
Number of valid rows in the table (i.e. rows that were not logically removed).
Definition: table.h:562
Definition: table.cpp:302
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:495
PTable ThresholdJoinOutputTable(const THash< TIntPr, TIntTr > &Counters, TInt Threshold, const TTable &Table)
Definition: table.cpp:2588
void Count(const TStr &CountColName, const TStr &Col)
Counts number of unique elements.
Definition: table.cpp:1782
PTable InitializeJointTable(const TTable &Table)
Initializes an empty table for the join of this table with the given table.
Definition: table.cpp:1896
void ColMax(const TStr &Attr1, const TStr &Attr2, const TStr &ResultAttrName="")
Performs max of two columns. See TTable::ColGenericOp.
Definition: table.cpp:4817
void Reserve(const TSizeTy &_MxVals)
Reserves enough memory for the vector to store _MxVals elements.
Definition: ds.h:515
void ClassifyAtomic(const TStr &Col1, const TStr &Col2, TPredComp Cmp, const TStr &LabelName, const TInt &PositiveLabel=1, const TInt &NegativeLabel=0)
Definition: table.cpp:2846
bool Cmp(const int &RelOp, const TRec &Rec1, const TRec &Rec2)
Definition: bd.h:426
void StoreIntCol(const TStr &ColName, const TIntV &ColVals)
Adds entire int column to table.
Definition: table.cpp:4064
void AddIdColumn(const TStr &IdColName)
Adds a column of explicit integer identifiers to the rows.
Definition: table.cpp:1880
void GetVariables(TStrV &Variables)
Get variables in the predicate tree rooted at this node.
Definition: table.cpp:1
static TInt CheckSortedKeyVal(TIntV &Key, TIntV &Val, TInt Start, TInt End)
Definition: table.cpp:5287
void AddEdgeAttributes(PNEANet &Graph, int RowId)
Adds attributes of edge corresponding to RowId to the Graph.
Definition: table.cpp:3375
void GetIntAttrNames(TStrV &Names) const
Gets vector of int attribute names.
Definition: network.h:1732
Definition: table.h:5
Definition: gbase.h:23
TVec< PNEANet > ToVarGraphSequence(TStr SplitAttr, TAttrAggr AggrPolicy, TIntPrV SplitIntervals)
Creates a sequence of graphs based on values of column SplitAttr and intervals specified by SplitInte...
Definition: table.cpp:3635
char * CStr()
Definition: dt.h:476
TInt GetNextIntAttr(TInt ColIdx) const
Returns value of integer attribute specified by integer column index for next row.
Definition: table.cpp:248
void ColGenericOp(const TStr &Attr1, const TStr &Attr2, const TStr &ResAttr, TArithOp op)
Performs columnwise arithmetic operation.
Definition: table.cpp:4729
void SelectAtomic(const TStr &Col1, const TStr &Col2, TPredComp Cmp, TIntV &SelectedRows, TBool Remove=true)
Selects rows using atomic compare operation.
Definition: table.cpp:2793
TRowIterator & operator++(int)
Increments the iterator.
Definition: table.cpp:131
bool IsKey(const TKey &Key) const
Definition: hash.h:216
void GetVariables(TStrV &Variables)
Get variables in current predicate.
Definition: table.cpp:10
bool IsInt(const int &FldN) const
Checks whether field FldN is an integer.
Definition: ss.h:143
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
void ColMin(const TStr &Attr1, const TStr &Attr2, const TStr &ResultAttrName="")
Performs min of two columns. See TTable::ColGenericOp.
Definition: table.cpp:4813
Definition: dt.h:881
void ColMod(const TStr &Attr1, const TStr &Attr2, const TStr &ResultAttrName="")
Performs columnwise modulus. See TTable::ColGenericOp.
Definition: table.cpp:4809
static PNEANet New()
Static constructor; returns a pointer to the graph. Ex: PNEANet Graph=TNEANet::New().
Definition: network.h:1940
void GetFltAttrNames(TStrV &Names) const
Gets vector of flt attribute names.
Definition: network.h:1701
void RemoveFirstRow()
Removes first valid row of the table.
Definition: table.cpp:1102
bool IsStrIn(const TStr &Str) const
Definition: dt.h:554
TBool IsFirst() const
Checks whether iterator points to first valid row of the table.
Definition: table.cpp:274
void Trunc(const TSizeTy &_Vals=-1)
Truncates the vector's length and capacity to _Vals elements.
Definition: ds.h:982
Atomic predicate - encapsulates comparison operations.
Definition: table.h:15
TBool IsColName(const TStr &ColName) const
Definition: table.h:656
Definition: table.h:268
TInt CheckAndAddFltNode(T Graph, THash< TFlt, TInt > &NodeVals, TFlt FNodeVal)
Checks if given NodeVal is seen earlier; if not, add it to Graph and hashmap NodeVals.
Definition: table.h:1540
TFlt GetFlt() const
Definition: table.h:236
Predicate node - represents a binary predicate operation on two predicate nodes.
Definition: table.h:51
int Len() const
Definition: hash.h:186
static PTable New()
Definition: table.h:931
void AddNodeAttributes(TInt NId, TStrV NodeAttrV, TInt RowId, THash< TInt, TStrIntVH > &NodeIntAttrs, THash< TInt, TStrFltVH > &NodeFltAttrs, THash< TInt, TStrStrVH > &NodeStrAttrs)
Takes as parameters, and updates, maps NodeXAttrs: Node Id --> (attribute name --> Vector of attribute values).
Definition: table.cpp:3394
void GetStrAttrNames(TStrV &Names) const
Gets vector of str attribute names.
Definition: network.h:1697
PNEANet GetFirstGraphFromSequence(TAttrAggr AggrPolicy)
Returns the first graph of the sequence.
Definition: table.cpp:3606
TDat & AddDat(const TKey &Key)
Definition: hash.h:196
PTable Intersection(const TTable &Table)
Returns intersection of this table with given Table.
Definition: table.cpp:4544
void AddNJointRowsMP(const TTable &T1, const TTable &T2, const TVec< TIntPrV > &JointRowIDSet)
Adds rows from T1 and T2 to this table in a parallel manner. Used by Join.
Definition: table.cpp:4419
const TDat & GetDat(const TKey &Key) const
Definition: hashmp.h:195
TFlt FltConst
Flt const value if this object is a float constant.
Definition: table.h:23
TBool Eval()
Return the result of evaluating current predicate.
Definition: table.cpp:14
TIntV GetFltRowIdxByVal(const TStr &ColName, const TFlt &Val) const
Gets the rows containing Val in flt column ColName.
Definition: table.cpp:5430
Definition: table.h:268
TSize GetContextMemUsedKB()
Returns approximate memory used by table context in [KB].
Definition: table.cpp:3946
uint64 GetStreamLen() const
Returns length of stream.
Definition: ssmp.h:93
TPredicateNode * Parent
Parent node of this node.
Definition: table.h:56
const TKey & GetKey(const int &KeyId) const
Definition: hash.h:210
TInt GetIntVal(const TStr &ColName, const TInt &RowIdx)
Gets the value of integer attribute ColName at row RowIdx.
Definition: table.h:1011
void AddTable(const TTable &T)
Adds all the rows of the input table. Allows duplicate rows (not a union).
Definition: table.cpp:3952
bool IsCmt() const
Checks whether the current line is a comment (starts with '#').
Definition: ss.h:120
void ColMul(const TStr &Attr1, const TStr &Attr2, const TStr &ResultAttrName="")
Performs columnwise multiplication. See TTable::ColGenericOp.
Definition: table.cpp:4801
TVal3 Val3
Definition: ds.h:133
void ClassifyAux(const TIntV &SelectedRows, const TStr &LabelName, const TInt &PositiveLabel=1, const TInt &NegativeLabel=0)
Adds a label attribute with positive labels on selected rows and negative labels on the rest.
Definition: table.cpp:4671
THash< TStr, TFlt > FltVars
Float variables in the current predicate tree.
Definition: table.h:85
void AddNRows(int NewRows, const TVec< TIntV > &IntColsP, const TVec< TFltV > &FltColsP, const TVec< TIntV > &StrColMapsP)
Adds NewRows rows from the given vectors for each column type.
Definition: table.cpp:4398
TVec< PTable > SpliceByGroup(const TStrV &GroupByAttrs, TBool Ordered=true)
Splices table into subtables according to a grouping statement.
Definition: table.cpp:1788
Definition: table.h:266
int GetKeyId(const char *Key) const
Definition: hash.h:922
Definition: table.h:5
void ColGenericOpMP(TInt ArgColIdx1, TInt ArgColIdx2, TAttrType ArgType1, TAttrType ArgType2, TInt ResColIdx, TArithOp op)
Definition: table.cpp:4685
TVec< PNEANet > GetGraphsFromSequence(TAttrAggr AggrPolicy)
Returns a sequence of graphs.
Definition: table.cpp:3594
TStrV GetDstNodeIntAttrV() const
Gets dst node int attribute name vector.
Definition: table.cpp:996
TAtomicPredicate Atom
Atomic predicate at this node.
Definition: table.h:55
bool IsFlt(const int &FldN) const
Checks whether field FldN is a float.
Definition: ss.h:148
TSizeTy AddV(const TVec< TVal, TSizeTy > &ValV)
Adds the elements of the vector ValV to the end of the vector.
Definition: ds.h:1056
TInt Partition(TIntV &V, TInt StartIdx, TInt EndIdx, const TVec< TAttrType > &SortByTypes, const TIntV &SortByIndices, TBool Asc)
Partitions vector for QSort.
Definition: table.cpp:3106
double GetFltFromFldV(TVec< char * > &FieldsV, const int &FldN)
Gets float at field FldN.
Definition: ssmp.cpp:170