SNAP Library 6.0, Developer Reference  2020-12-09 16:24:20
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
word2vec.cpp
#include "stdafx.h"
#include "Snap.h"
#include "word2vec.h"

//Code from https://github.com/nicholas-leonard/word2vec/blob/master/word2vec.c
//Customized for SNAP and node2vec
//Count how many times each (renamed) node id occurs across all walks
void LearnVocab(TVVec<TInt, int64>& WalksVV, TIntV& Vocab) {
  for (int64 i = 0; i < Vocab.Len(); i++) { Vocab[i] = 0; }
  for (int64 i = 0; i < WalksVV.GetXDim(); i++) {
    for (int j = 0; j < WalksVV.GetYDim(); j++) {
      Vocab[WalksVV(i,j)]++;
    }
  }
}
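At this point Vocab[v] holds the number of occurrences of node v across all walks; InitUnigramTable below raises these counts to the 0.75 power to build the noise distribution used for negative sampling.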

//Precompute the unigram table using the alias sampling method:
//KTable holds the alias of each slot, UTable its acceptance probability
void InitUnigramTable(TIntV& Vocab, TIntV& KTable, TFltV& UTable) {
  double TrainWordsPow = 0;
  double Pwr = 0.75;
  TFltV ProbV(Vocab.Len());
  //Raise counts to the 3/4 power, as in the original word2vec
  for (int64 i = 0; i < Vocab.Len(); i++) {
    ProbV[i] = TMath::Power(Vocab[i], Pwr);
    TrainWordsPow += ProbV[i];
    KTable[i] = 0;
    UTable[i] = 0;
  }
  for (int64 i = 0; i < ProbV.Len(); i++) {
    ProbV[i] /= TrainWordsPow;
  }
  //Partition slots into those below and above the uniform weight 1
  TIntV UnderV;
  TIntV OverV;
  for (int64 i = 0; i < ProbV.Len(); i++) {
    UTable[i] = ProbV[i] * ProbV.Len();
    if (UTable[i] < 1) {
      UnderV.Add(i);
    } else {
      OverV.Add(i);
    }
  }
  //Pair each underfull slot with an overfull one; the overfull slot
  //becomes the alias and donates the missing probability mass
  while (UnderV.Len() > 0 && OverV.Len() > 0) {
    int64 Small = UnderV.Last();
    int64 Large = OverV.Last();
    UnderV.DelLast();
    OverV.DelLast();
    KTable[Small] = Large;
    UTable[Large] = (UTable[Large] + UTable[Small]) - 1;
    if (UTable[Large] < 1) {
      UnderV.Add(Large);
    } else {
      OverV.Add(Large);
    }
  }
  //Slots left over from floating-point round-off always accept
  while (UnderV.Len() > 0) {
    int64 Curr = UnderV.Last();
    UnderV.DelLast();
    UTable[Curr] = 1;
  }
  while (OverV.Len() > 0) {
    int64 Curr = OverV.Last();
    OverV.DelLast();
    UTable[Curr] = 1;
  }
}

//Draw one sample from the noise distribution in O(1): pick a slot
//uniformly at random, then either accept it or take its alias
int64 RndUnigramInt(TIntV& KTable, TFltV& UTable, TRnd& Rnd) {
  int64 X = static_cast<int64>(Rnd.GetUniDev() * KTable.Len());
  double Y = Rnd.GetUniDev();
  return Y < UTable[X] ? X : KTable[X];
}
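As a sanity check on the alias method, the following is a minimal standalone sketch of the same construction and draw using std containers; BuildAlias, DrawAlias, and the toy counts are illustrative names and values, not part of SNAP. With counts {5, 1, 2} the observed frequencies should approach the unigram^0.75 distribution.

#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <vector>

//Build alias/acceptance tables for a probability vector P (sums to 1),
//mirroring InitUnigramTable above with std containers
void BuildAlias(const std::vector<double>& P,
                std::vector<int>& K, std::vector<double>& U) {
  const int N = (int)P.size();
  K.assign(N, 0);
  U.resize(N);
  std::vector<int> Under, Over;
  for (int i = 0; i < N; i++) {
    U[i] = P[i] * N;
    if (U[i] < 1) { Under.push_back(i); } else { Over.push_back(i); }
  }
  while (!Under.empty() && !Over.empty()) {
    int Small = Under.back(); Under.pop_back();
    int Large = Over.back(); Over.pop_back();
    K[Small] = Large;          //Large becomes Small's alias
    U[Large] += U[Small] - 1;  //and donates the missing mass
    if (U[Large] < 1) { Under.push_back(Large); } else { Over.push_back(Large); }
  }
  for (size_t i = 0; i < Under.size(); i++) { U[Under[i]] = 1; }  //round-off
  for (size_t i = 0; i < Over.size(); i++) { U[Over[i]] = 1; }    //leftovers
}

//O(1) draw: uniform slot, then accept it or take its alias
int DrawAlias(const std::vector<int>& K, const std::vector<double>& U) {
  int X = std::rand() % (int)K.size();
  double Y = std::rand() / (double)RAND_MAX;
  return Y < U[X] ? X : K[X];
}

int main() {
  //Toy node counts 5, 1, 2 -> unigram^0.75 noise distribution
  double Counts[3] = {5, 1, 2};
  std::vector<double> P(3);
  double Z = 0;
  for (int i = 0; i < 3; i++) { P[i] = std::pow(Counts[i], 0.75); Z += P[i]; }
  for (int i = 0; i < 3; i++) { P[i] /= Z; }
  std::vector<int> K;
  std::vector<double> U;
  BuildAlias(P, K, U);
  int Hist[3] = {0, 0, 0};
  for (int t = 0; t < 100000; t++) { Hist[DrawAlias(K, U)]++; }
  printf("expected %.3f %.3f %.3f\n", P[0], P[1], P[2]);
  printf("observed %.3f %.3f %.3f\n", Hist[0]/1e5, Hist[1]/1e5, Hist[2]/1e5);
  return 0;
}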

//Initialize negative (output) embeddings to zero
void InitNegEmb(TIntV& Vocab, const int& Dimensions, TVVec<TFlt, int64>& SynNeg) {
  SynNeg = TVVec<TFlt, int64>(Vocab.Len(), Dimensions);
  for (int64 i = 0; i < SynNeg.GetXDim(); i++) {
    for (int j = 0; j < SynNeg.GetYDim(); j++) {
      SynNeg(i,j) = 0;
    }
  }
}

//Initialize positive (input) embeddings to small random values
void InitPosEmb(TIntV& Vocab, const int& Dimensions, TRnd& Rnd, TVVec<TFlt, int64>& SynPos) {
  SynPos = TVVec<TFlt, int64>(Vocab.Len(), Dimensions);
  for (int64 i = 0; i < SynPos.GetXDim(); i++) {
    for (int j = 0; j < SynPos.GetYDim(); j++) {
      SynPos(i,j) = (Rnd.GetUniDev() - 0.5) / Dimensions;
    }
  }
}
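This mirrors the initialization of the original word2vec: input vectors start uniformly in (-0.5/Dimensions, 0.5/Dimensions) while output vectors start at zero. The random SynPos rows break the symmetry, so the identical all-zero rows of SynNeg still receive distinct gradients from the very first updates.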

//One training pass over a single walk (row CurrWalk of WalksVV).
//Threads call this concurrently and update SynPos/SynNeg without locks,
//in the lock-free "hogwild" style of the original word2vec.
void TrainModel(TVVec<TInt, int64>& WalksVV, const int& Dimensions,
    const int& WinSize, const int& Iter, const bool& Verbose,
    TIntV& KTable, TFltV& UTable, int64& WordCntAll, TFltV& ExpTable,
    double& Alpha, int64 CurrWalk, TRnd& Rnd,
    TVVec<TFlt, int64>& SynNeg, TVVec<TFlt, int64>& SynPos) {
  TFltV Neu1V(Dimensions);  //unused in this skip-gram path
  TFltV Neu1eV(Dimensions);
  int64 AllWords = WalksVV.GetXDim() * WalksVV.GetYDim();
  TIntV WalkV(WalksVV.GetYDim());
  for (int j = 0; j < WalksVV.GetYDim(); j++) { WalkV[j] = WalksVV(CurrWalk,j); }
  for (int64 WordI = 0; WordI < WalkV.Len(); WordI++) {
    if (WordCntAll % 10000 == 0) {
      if (Verbose) {
        printf("\rLearning Progress: %.2lf%% ",(double)WordCntAll*100/(double)(Iter*AllWords));
        fflush(stdout);
      }
      //Decay the learning rate linearly, with a floor at 0.01% of StartAlpha
      Alpha = StartAlpha * (1 - WordCntAll / static_cast<double>(Iter * AllWords + 1));
      if (Alpha < StartAlpha * 0.0001) { Alpha = StartAlpha * 0.0001; }
    }
    int64 Word = WalkV[WordI];
    for (int i = 0; i < Dimensions; i++) {
      Neu1V[i] = 0;
      Neu1eV[i] = 0;
    }
    //Randomly shrink the context window, as in the original word2vec
    int Offset = Rnd.GetUniDevInt() % WinSize;
    for (int a = Offset; a < WinSize * 2 + 1 - Offset; a++) {
      if (a == WinSize) { continue; }  //skip the center word itself
      int64 CurrWordI = WordI - WinSize + a;
      if (CurrWordI < 0) { continue; }
      if (CurrWordI >= WalkV.Len()) { continue; }
      int64 CurrWord = WalkV[CurrWordI];
      for (int i = 0; i < Dimensions; i++) { Neu1eV[i] = 0; }
      //Negative sampling: j == 0 is the observed (positive) pair,
      //the remaining NegSamN draws are noise (negative) pairs
      for (int j = 0; j < NegSamN+1; j++) {
        int64 Target, Label;
        if (j == 0) {
          Target = Word;
          Label = 1;
        } else {
          Target = RndUnigramInt(KTable, UTable, Rnd);
          if (Target == Word) { continue; }
          Label = 0;
        }
        double Product = 0;
        for (int i = 0; i < Dimensions; i++) {
          Product += SynPos(CurrWord,i) * SynNeg(Target,i);
        }
        double Grad; //Gradient multiplied by learning rate:
                     //(Label - sigmoid(Product)) * Alpha
        if (Product > MaxExp) { Grad = (Label - 1) * Alpha; }
        else if (Product < -MaxExp) { Grad = Label * Alpha; }
        else {
          double Exp = ExpTable[static_cast<int>(Product*ExpTablePrecision)+TableSize/2];
          Grad = (Label - 1 + 1 / (1 + Exp)) * Alpha;
        }
        for (int i = 0; i < Dimensions; i++) {
          Neu1eV[i] += Grad * SynNeg(Target,i);
          SynNeg(Target,i) += Grad * SynPos(CurrWord,i);
        }
      }
      //Apply the accumulated gradient to the context vector once
      for (int i = 0; i < Dimensions; i++) {
        SynPos(CurrWord,i) += Neu1eV[i];
      }
    }
    WordCntAll++;
  }
}
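For each (context, target) pair the loop above performs one skip-gram negative-sampling step: with Label in {0, 1} and inner product Product, the update is Grad = (Label - sigmoid(Product)) * Alpha, which the code expresses as (Label - 1 + 1/(1 + e^Product)) * Alpha because 1/(1 + e^x) = 1 - sigmoid(x); e^Product is read from the precomputed ExpTable, and |Product| > MaxExp falls back to the saturated gradient. A minimal sketch of one such step with std::vector follows; SgnsUpdate is an illustrative name, not part of SNAP.

#include <cmath>
#include <vector>

//One negative-sampling update for a (context, target, label) triple;
//TrainModel above does the same on TVVec rows, reading the sigmoid
//from ExpTable instead of calling exp()
void SgnsUpdate(std::vector<double>& CtxVec,      //row of SynPos
                std::vector<double>& TgtVec,      //row of SynNeg
                int Label, double Alpha,
                std::vector<double>& CtxGradAcc) {//Neu1eV accumulator
  double Product = 0;
  for (size_t i = 0; i < CtxVec.size(); i++) { Product += CtxVec[i] * TgtVec[i]; }
  double Sigmoid = 1.0 / (1.0 + std::exp(-Product));
  double Grad = (Label - Sigmoid) * Alpha;  //gradient times learning rate
  for (size_t i = 0; i < CtxVec.size(); i++) {
    CtxGradAcc[i] += Grad * TgtVec[i];  //applied to the context row after
    TgtVec[i] += Grad * CtxVec[i];      //all samples; target row updated now
  }
}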


//Learns embeddings using SGD, skip-gram with negative sampling
void LearnEmbeddings(TVVec<TInt, int64>& WalksVV, const int& Dimensions,
    const int& WinSize, const int& Iter, const bool& Verbose,
    TIntFltVH& EmbeddingsHV) {
  TIntIntH RnmH;
  TIntIntH RnmBackH;
  int64 NNodes = 0;
  //Rename node ids into consecutive numbers 0..NNodes-1 so they can
  //index the embedding matrices; RnmBackH restores the original ids
  for (int64 i = 0; i < WalksVV.GetXDim(); i++) {
    for (int64 j = 0; j < WalksVV.GetYDim(); j++) {
      if (RnmH.IsKey(WalksVV(i, j))) {
        WalksVV(i, j) = RnmH.GetDat(WalksVV(i, j));
      } else {
        RnmH.AddDat(WalksVV(i, j), NNodes);
        RnmBackH.AddDat(NNodes, WalksVV(i, j));
        WalksVV(i, j) = NNodes++;
      }
    }
  }
  TIntV Vocab(NNodes);
  LearnVocab(WalksVV, Vocab);
  TIntV KTable(NNodes);
  TFltV UTable(NNodes);
  TVVec<TFlt, int64> SynNeg;
  TVVec<TFlt, int64> SynPos;
  TRnd Rnd(time(NULL));
  InitPosEmb(Vocab, Dimensions, Rnd, SynPos);
  InitNegEmb(Vocab, Dimensions, SynNeg);
  InitUnigramTable(Vocab, KTable, UTable);
  //Precompute e^x on [-MaxExp, MaxExp] so the sigmoid in TrainModel
  //becomes a table lookup
  TFltV ExpTable(TableSize);
  double Alpha = StartAlpha; //learning rate
#pragma omp parallel for schedule(dynamic)
  for (int i = 0; i < TableSize; i++) {
    double Value = -MaxExp + static_cast<double>(i) / static_cast<double>(ExpTablePrecision);
    ExpTable[i] = TMath::Power(TMath::E, Value);
  }
  int64 WordCntAll = 0;
// op RS 2016/09/26, collapse does not compile on Mac OS X
//#pragma omp parallel for schedule(dynamic) collapse(2)
  for (int j = 0; j < Iter; j++) {
    //Walks within one iteration are trained in parallel
#pragma omp parallel for schedule(dynamic)
    for (int64 i = 0; i < WalksVV.GetXDim(); i++) {
      TrainModel(WalksVV, Dimensions, WinSize, Iter, Verbose, KTable, UTable,
        WordCntAll, ExpTable, Alpha, i, Rnd, SynNeg, SynPos);
    }
  }
  if (Verbose) { printf("\n"); fflush(stdout); }
  //Copy rows of SynPos back out under the original node ids
  for (int64 i = 0; i < SynPos.GetXDim(); i++) {
    TFltV CurrV(SynPos.GetYDim());
    for (int j = 0; j < SynPos.GetYDim(); j++) { CurrV[j] = SynPos(i, j); }
    EmbeddingsHV.AddDat(RnmBackH.GetDat(i), CurrV);
  }
}
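A minimal usage sketch, assuming it is compiled inside the SNAP tree so that stdafx.h, Snap.h, and word2vec.h resolve: fill a TVVec with walks over the graph's node ids, call LearnEmbeddings, and read per-node vectors out of the returned hash. The toy walks below are illustrative values.

#include "stdafx.h"
#include "Snap.h"
#include "word2vec.h"

int main() {
  //Two toy walks of length 5 over node ids 0..3
  TVVec<TInt, int64> WalksVV(2, 5);
  int Walks[2][5] = { {0, 1, 2, 1, 0}, {2, 3, 2, 1, 3} };
  for (int i = 0; i < 2; i++) {
    for (int j = 0; j < 5; j++) { WalksVV(i, j) = Walks[i][j]; }
  }
  TIntFltVH EmbeddingsHV;
  //8-dimensional embeddings, window 2, 1 iteration, verbose progress
  LearnEmbeddings(WalksVV, 8, 2, 1, true, EmbeddingsHV);
  //Keys are the original node ids; values are the embedding vectors
  for (TIntFltVH::TIter It = EmbeddingsHV.BegI(); It < EmbeddingsHV.EndI(); It++) {
    printf("node %d:", It.GetKey().Val);
    for (int j = 0; j < It.GetDat().Len(); j++) { printf(" %f", It.GetDat()[j].Val); }
    printf("\n");
  }
  return 0;
}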