SNAP Library 2.2, Developer Reference
2014-03-11 19:15:55
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
00001 00002 #include "stdafx.h" 00003 #include "agmattr.h" 00004 #include "Snap.h" 00005 #include "agm.h" 00006 00007 00008 00009 void TCesna::RandomInit(const int InitComs) { 00010 F.Gen(G->GetNodes()); 00011 SumFV.Gen(InitComs); 00012 NumComs = InitComs; 00013 for (int u = 0; u < F.Len(); u++) { 00014 //assign to just one community 00015 int Mem = G->GetNI(u).GetDeg(); 00016 if (Mem > 10) { Mem = 10; } 00017 for (int c = 0; c < Mem; c++) { 00018 int CID = Rnd.GetUniDevInt(InitComs); 00019 AddCom(u, CID, Rnd.GetUniDev()); 00020 } 00021 } 00022 //assign a member to zero-member community (if any) 00023 for (int c = 0; c < SumFV.Len(); c++) { 00024 if (SumFV[c] == 0.0) { 00025 int UID = Rnd.GetUniDevInt(G->GetNodes()); 00026 AddCom(UID, c, Rnd.GetUniDev()); 00027 } 00028 } 00029 InitW(); 00030 } 00031 00032 void TCesna::NeighborComInit(const int InitComs) { 00033 //initialize with best neighborhood communities (Gleich et.al. KDD'12) 00034 TFltIntPrV NIdPhiV(F.Len(), 0); 00035 TCesnaUtil::GetNIdPhiV<PUNGraph>(G, NIdPhiV); 00036 NeighborComInit(NIdPhiV, InitComs); 00037 } 00038 00039 void TCesna::NeighborComInit(TFltIntPrV& NIdPhiV, const int InitComs) { 00040 //initialize with best neighborhood communities (Gleich et.al. KDD'12) 00041 NIdPhiV.Sort(true); 00042 F.Gen(G->GetNodes()); 00043 SumFV.Gen(InitComs); 00044 NumComs = InitComs; 00045 TIntSet InvalidNIDS(F.Len()); 00046 TIntV ChosenNIDV(InitComs, 0); //FOR DEBUG 00047 //choose nodes with local minimum in conductance 00048 int CurCID = 0; 00049 for (int ui = 0; ui < NIdPhiV.Len(); ui++) { 00050 int UID = NIdPhiV[ui].Val2; 00051 fflush(stdout); 00052 if (InvalidNIDS.IsKey(UID)) { continue; } 00053 ChosenNIDV.Add(UID); //FOR DEBUG 00054 //add the node and its neighbors to the current community 00055 AddCom(UID, CurCID, 1.0); 00056 TUNGraph::TNodeI NI = G->GetNI(UID); 00057 fflush(stdout); 00058 for (int e = 0; e < NI.GetDeg(); e++) { 00059 AddCom(NI.GetNbrNId(e), CurCID, 1.0); 00060 } 00061 //exclude its neighbors from the next considerations 00062 for (int e = 0; e < NI.GetDeg(); e++) { 00063 InvalidNIDS.AddKey(NI.GetNbrNId(e)); 00064 } 00065 CurCID++; 00066 fflush(stdout); 00067 if (CurCID >= NumComs) { break; } 00068 } 00069 if (NumComs > CurCID) { 00070 printf("%d communities needed to fill randomly\n", NumComs - CurCID); 00071 } 00072 //assign a member to zero-member community (if any) 00073 for (int c = 0; c < SumFV.Len(); c++) { 00074 if (SumFV[c] == 0.0) { 00075 int ComSz = 10; 00076 for (int u = 0; u < ComSz; u++) { 00077 int UID = Rnd.GetUniDevInt(G->GetNodes()); 00078 AddCom(UID, c, Rnd.GetUniDev()); 00079 } 00080 } 00081 } 00082 InitW(); 00083 } 00084 00085 void TCesna::SetCmtyVV(const TVec<TIntV>& CmtyVV) { 00086 F.Gen(G->GetNodes()); 00087 SumFV.Gen(CmtyVV.Len()); 00088 NumComs = CmtyVV.Len(); 00089 InitW(); 00090 for (int c = 0; c < CmtyVV.Len(); c++) { 00091 for (int u = 0; u < CmtyVV[c].Len(); u++) { 00092 int UID = CmtyVV[c][u]; 00093 if (! NIDToIdx.IsKey(UID)) { continue; } 00094 AddCom(NIDToIdx.GetKeyId(UID), c, 1.0); 00095 } 00096 } 00097 } 00098 00099 void TCesna::SetGraph(const PUNGraph& GraphPt, const THash<TInt, TIntV>& NIDAttrH) { 00100 HOVIDSV.Gen(GraphPt->GetNodes()); 00101 HOKIDSV.Gen(GraphPt->GetNodes()); 00102 X.Gen(GraphPt->GetNodes()); 00103 TIntV NIDV; 00104 GraphPt->GetNIdV(NIDV); 00105 NIDToIdx.Gen(NIDV.Len()); 00106 NIDToIdx.AddKeyV(NIDV); 00107 // check that nodes IDs are {0,1,..,Nodes-1} 00108 printf("rearrage nodes\n"); 00109 G = TSnap::GetSubGraph(GraphPt, NIDV, true); 00110 for (int nid = 0; nid < G->GetNodes(); nid++) { 00111 IAssert(G->IsNode(nid)); 00112 } 00113 TSnap::DelSelfEdges(G); 00114 00115 PNoCom = 1.0 / (double) G->GetNodes(); 00116 DoParallel = false; 00117 if (1.0 / PNoCom > sqrt(TFlt::Mx)) { PNoCom = 0.99 / sqrt(TFlt::Mx); } // to prevent overflow 00118 NegWgt = 1.0; 00119 00120 // add node attributes, find number of attributes 00121 int NumAttr = 0; 00122 for (int u = 0; u < NIDAttrH.Len(); u++) { 00123 int UID = NIDAttrH.GetKey(u); 00124 if (! NIDToIdx.IsKey(UID)) { continue; } 00125 X[NIDToIdx.GetKeyId(UID)].Gen(NIDAttrH[u].Len()); 00126 for (int k = 0; k < NIDAttrH[u].Len(); k++) { 00127 int KID = NIDAttrH[u][k]; 00128 IAssert (KID >= 0); 00129 X[NIDToIdx.GetKeyId(UID)].AddKey(KID); 00130 if (NumAttr < KID + 1) { NumAttr = KID + 1; } 00131 } 00132 } 00133 Attrs = NumAttr; 00134 InitW(); 00135 } 00136 00137 double TCesna::Likelihood(const bool _DoParallel) { 00138 TExeTm ExeTm; 00139 double L = 0.0; 00140 if (_DoParallel) { 00141 #pragma omp parallel for 00142 for (int u = 0; u < F.Len(); u++) { 00143 double LU = LikelihoodForRow(u); 00144 #pragma omp atomic 00145 L += LU; 00146 } 00147 } 00148 else { 00149 for (int u = 0; u < F.Len(); u++) { 00150 double LU = LikelihoodForRow(u); 00151 L += LU; 00152 } 00153 } 00154 return L; 00155 } 00156 00157 double TCesna::LikelihoodForRow(const int UID) { 00158 return LikelihoodForRow(UID, F[UID]); 00159 } 00160 00161 double TCesna::LikelihoodForRow(const int UID, const TIntFltH& FU) { 00162 double L = 0.0; 00163 TFltV HOSumFV; //adjust for Fv of v hold out 00164 if (HOVIDSV[UID].Len() > 0) { 00165 HOSumFV.Gen(SumFV.Len()); 00166 00167 for (int e = 0; e < HOVIDSV[UID].Len(); e++) { 00168 for (int c = 0; c < SumFV.Len(); c++) { 00169 HOSumFV[c] += GetCom(HOVIDSV[UID][e], c); 00170 } 00171 } 00172 } 00173 00174 TUNGraph::TNodeI NI = G->GetNI(UID); 00175 for (int e = 0; e < NI.GetDeg(); e++) { 00176 int v = NI.GetNbrNId(e); 00177 if (v == UID) { continue; } 00178 if (HOVIDSV[UID].IsKey(v)) { continue; } 00179 L += log (1.0 - Prediction(FU, F[v])) + NegWgt * DotProduct(FU, F[v]); 00180 } 00181 for (TIntFltH::TIter HI = FU.BegI(); HI < FU.EndI(); HI++) { 00182 double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[HI.GetKey()].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist 00183 L -= NegWgt * (SumFV[HI.GetKey()] - HOSum - GetCom(UID, HI.GetKey())) * HI.GetDat(); 00184 } 00185 //add regularization 00186 if (RegCoef > 0.0) { //L1 00187 L -= RegCoef * Sum(FU); 00188 } 00189 if (RegCoef < 0.0) { //L2 00190 L += RegCoef * Norm2(FU); 00191 } 00192 L *= (1.0 - WeightAttr); 00193 // add attribute part 00194 for (int k = 0; k < Attrs; k++) { 00195 if (HOKIDSV[UID].IsKey(k)) { continue; } 00196 L += WeightAttr * LikelihoodAttrKForRow(UID, k, FU); 00197 } 00198 return L; 00199 } 00200 00201 00202 double TCesna::LikelihoodAttrKForRow(const int UID, const int K, const TIntFltH& FU, const TFltV& WK) { 00203 double Prob = PredictAttrK(FU, WK); 00204 double L = 0.0; 00205 if (GetAttr(UID, K)) { 00206 L = Prob == 0.0? -100.0: log(Prob); 00207 } else { 00208 L = Prob == 1.0? -100.0: log(1.0 - Prob); 00209 } 00210 return L; 00211 } 00212 00213 void TCesna::GradientForRow(const int UID, TIntFltH& GradU, const TIntSet& CIDSet) { 00214 GradU.Gen(CIDSet.Len()); 00215 TFltV HOSumFV; //adjust for Fv of v hold out 00216 if (HOVIDSV[UID].Len() > 0) { 00217 HOSumFV.Gen(SumFV.Len()); 00218 00219 for (int e = 0; e < HOVIDSV[UID].Len(); e++) { 00220 for (int c = 0; c < SumFV.Len(); c++) { 00221 HOSumFV[c] += GetCom(HOVIDSV[UID][e], c); 00222 } 00223 } 00224 } 00225 TUNGraph::TNodeI NI = G->GetNI(UID); 00226 int Deg = NI.GetDeg(); 00227 TFltV PredV(Deg), GradV(CIDSet.Len()); 00228 TIntV CIDV(CIDSet.Len()); 00229 for (int e = 0; e < Deg; e++) { 00230 if (NI.GetNbrNId(e) == UID) { continue; } 00231 if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; } 00232 PredV[e] = Prediction(UID, NI.GetNbrNId(e)); 00233 } 00234 00235 for (int c = 0; c < CIDSet.Len(); c++) { 00236 int CID = CIDSet.GetKey(c); 00237 double Val = 0.0; 00238 for (int e = 0; e < Deg; e++) { 00239 int VID = NI.GetNbrNId(e); 00240 if (VID == UID) { continue; } 00241 if (HOVIDSV[UID].IsKey(VID)) { continue; } 00242 Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID); 00243 } 00244 double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist 00245 Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID)); 00246 CIDV[c] = CID; 00247 GradV[c] = Val; 00248 } 00249 //add regularization 00250 if (RegCoef > 0.0) { //L1 00251 for (int c = 0; c < GradV.Len(); c++) { 00252 GradV[c] -= RegCoef; 00253 } 00254 } 00255 if (RegCoef < 0.0) { //L2 00256 for (int c = 0; c < GradV.Len(); c++) { 00257 GradV[c] += 2 * RegCoef * GetCom(UID, CIDV[c]); 00258 } 00259 } 00260 for (int c = 0; c < GradV.Len(); c++) { 00261 GradV[c] *= (1.0 - WeightAttr); 00262 } 00263 //add attribute part 00264 TFltV AttrPredV(Attrs); 00265 for (int k = 0; k < Attrs; k++) { 00266 if (HOKIDSV[UID].IsKey(k)) { continue; } 00267 AttrPredV[k] = PredictAttrK(F[UID], W[k]); 00268 } 00269 for (int c = 0; c < GradV.Len(); c++) { 00270 for (int k = 0; k < Attrs; k++) { 00271 if (HOKIDSV[UID].IsKey(k)) { continue; } 00272 GradV[c] += WeightAttr * (GetAttr(UID, k) - AttrPredV[k]) * GetW(CIDV[c], k); 00273 } 00274 } 00275 00276 00277 for (int c = 0; c < GradV.Len(); c++) { 00278 if (GetCom(UID, CIDV[c]) == 0.0 && GradV[c] < 0.0) { continue; } 00279 if (fabs(GradV[c]) < 0.0001) { continue; } 00280 GradU.AddDat(CIDV[c], GradV[c]); 00281 } 00282 for (int c = 0; c < GradU.Len(); c++) { 00283 if (GradU[c] >= 10) { GradU[c] = 10; } 00284 if (GradU[c] <= -10) { GradU[c] = -10; } 00285 IAssert(GradU[c] >= -10); 00286 } 00287 } 00288 00289 void TCesna::GetCmtyVV(TVec<TIntV>& CmtyVV) { 00290 TVec<TFltV> TmpV; 00291 GetCmtyVV(CmtyVV, TmpV, sqrt(2.0 * (double) G->GetEdges() / G->GetNodes() / G->GetNodes()), 3); 00292 } 00293 00294 00296 void TCesna::GetCmtyVV(TVec<TIntV>& CmtyVV, TVec<TFltV>& Wck, const double Thres, const int MinSz) { 00297 CmtyVV.Gen(NumComs, 0); 00298 Wck.Gen(NumComs, 0); 00299 TIntFltH CIDSumFH(NumComs); 00300 for (int c = 0; c < SumFV.Len(); c++) { 00301 CIDSumFH.AddDat(c, SumFV[c]); 00302 } 00303 CIDSumFH.SortByDat(false); 00304 for (int c = 0; c < NumComs; c++) { 00305 int CID = CIDSumFH.GetKey(c); 00306 TIntFltH NIDFucH(F.Len() / 10); 00307 TIntV CmtyV; 00308 IAssert(SumFV[CID] == CIDSumFH.GetDat(CID)); 00309 if (SumFV[CID] < Thres) { continue; } 00310 for (int u = 0; u < F.Len(); u++) { 00311 int NID = NIDToIdx[u]; 00312 if (GetCom(u, CID) >= Thres) { NIDFucH.AddDat(NID, GetCom(u, CID)); } 00313 } 00314 NIDFucH.SortByDat(false); 00315 NIDFucH.GetKeyV(CmtyV); 00316 if (CmtyV.Len() < MinSz) { continue; } 00317 CmtyVV.Add(CmtyV); 00318 TFltV WV(Attrs); 00319 for (int k = 0; k < Attrs; k++) { 00320 WV[k] = GetW(CID, k); 00321 } 00322 Wck.Add(WV); 00323 } 00324 if ( NumComs != CmtyVV.Len()) { 00325 printf("Community vector generated. %d communities are ommitted\n", NumComs.Val - CmtyVV.Len()); 00326 } 00327 } 00328 00329 void TCesna::GetCmtyVVUnSorted(TVec<TIntV>& CmtyVV) { 00330 GetCmtyVVUnSorted(CmtyVV, sqrt(2.0 * (double) G->GetEdges() / G->GetNodes() / G->GetNodes()), 3); 00331 } 00332 00333 void TCesna::GetCmtyVVUnSorted(TVec<TIntV>& CmtyVV, const double Thres, const int MinSz) { 00334 CmtyVV.Gen(NumComs, 0); 00335 for (int c = 0; c < NumComs; c++) { 00336 TIntV CmtyV; 00337 for (int u = 0; u < G->GetNodes(); u++) { 00338 if (GetCom(u, c) > Thres) { CmtyV.Add(NIDToIdx[u]); } 00339 } 00340 if (CmtyV.Len() >= MinSz) { CmtyVV.Add(CmtyV); } 00341 } 00342 if ( NumComs != CmtyVV.Len()) { 00343 printf("Community vector generated. %d communities are ommitted\n", NumComs.Val - CmtyVV.Len()); 00344 } 00345 } 00346 00348 int TCesna::FindComs(const int NumThreads, const int MaxComs, const int MinComs, const int DivComs, const TStr OutFNm, const bool UseBIC, const double HOFrac, const double StepAlpha, const double StepBeta) { 00349 double ComsGap = exp(TMath::Log((double) MaxComs / (double) MinComs) / (double) DivComs); 00350 TIntV ComsV; 00351 ComsV.Add(MinComs); 00352 while (ComsV.Len() < DivComs) { 00353 int NewComs = int(ComsV.Last() * ComsGap); 00354 if (NewComs == ComsV.Last().Val) { NewComs++; } 00355 ComsV.Add(NewComs); 00356 } 00357 if (ComsV.Last() < MaxComs) { ComsV.Add(MaxComs); } 00358 return FindComs(ComsV, UseBIC, HOFrac, NumThreads, OutFNm, StepAlpha, StepBeta); 00359 } 00360 00361 int TCesna::FindComs(TIntV& ComsV, const bool UseBIC, const double HOFrac, const int NumThreads, const TStr PlotLFNm, const double StepAlpha, const double StepBeta) { 00362 if (ComsV.Len() == 0) { 00363 int MaxComs = G->GetNodes() / 5; 00364 ComsV.Add(2); 00365 while(ComsV.Last() < MaxComs) { ComsV.Add(ComsV.Last() * 2); } 00366 } 00367 int MaxIterCV = 3; 00368 00369 TVec<TVec<TIntSet> > HoldOutSets(MaxIterCV), HoldOutSetsAttr(MaxIterCV); 00370 TFltIntPrV NIdPhiV; 00371 TCesnaUtil::GetNIdPhiV<PUNGraph>(G, NIdPhiV); 00372 if (! UseBIC) { //if edges are many enough, use CV 00373 //printf("generating hold out set\n"); 00374 TIntV NIdV1, NIdV2; 00375 G->GetNIdV(NIdV1); 00376 G->GetNIdV(NIdV2); 00377 for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) { 00378 // generate holdout sets 00379 TCesnaUtil::GenHoldOutPairs(G, HoldOutSets[IterCV], HOFrac, Rnd); 00380 GenHoldOutAttr(HOFrac, HoldOutSetsAttr[IterCV]); 00381 } 00382 //printf("hold out set generated\n"); 00383 } 00384 00385 TFltV HOLV(ComsV.Len()); 00386 TIntFltPrV ComsLV; 00387 for (int c = 0; c < ComsV.Len(); c++) { 00388 const int Coms = ComsV[c]; 00389 //printf("Try number of Coms:%d\n", Coms); 00390 00391 if (! UseBIC) { //if edges are many enough, use CV 00392 for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) { 00393 HOVIDSV = HoldOutSets[IterCV]; 00394 HOKIDSV = HoldOutSetsAttr[IterCV]; 00395 NeighborComInit(NIdPhiV, Coms); 00396 //printf("Initialized\n"); 00397 00398 if (NumThreads == 1) { 00399 //printf("MLE without parallelization begins\n"); 00400 MLEGradAscent(0.01, 100 * G->GetNodes(), "", StepAlpha, StepBeta); 00401 //printf("likelihood: train:%f, attr:%f, hold:%f\n", Likelihood(), LikelihoodAttr(), LikelihoodHoldOut()); 00402 } else { 00403 //printf("MLE with parallelization begins\n"); 00404 MLEGradAscentParallel(0.01, 100, NumThreads, "", StepAlpha, StepBeta); 00405 } 00406 double HOL = LikelihoodHoldOut(); 00407 HOL = HOL < 0? HOL: TFlt::Mn; 00408 HOLV[c] += HOL; 00409 } 00410 } 00411 else { 00412 HOVIDSV.Gen(G->GetNodes()); 00413 HOKIDSV.Gen(G->GetNodes()); 00414 if (NumThreads == 1) { 00415 MLEGradAscent(0.005, 100 * G->GetNodes(), "", StepAlpha, StepBeta); 00416 printf("likelihood: train:%f, attr:%f, hold:%f\n", Likelihood(), LikelihoodAttr(), LikelihoodHoldOut()); 00417 } else { 00418 MLEGradAscentParallel(0.005, 100, NumThreads, "", StepAlpha, StepBeta); 00419 } 00420 //int NumParams = G->GetNodes() * Coms + Coms * Attrs; 00421 double NumParams = (1.0 - WeightAttr) * Coms + WeightAttr * Coms * Attrs; 00422 double Observations = (1.0 - WeightAttr) * G->GetNodes() * (G->GetNodes() - 1) / 2 + WeightAttr * G->GetNodes() * Attrs; 00423 double BIC = 2 * Likelihood() - NumParams * log (Observations); 00424 HOLV[c] = BIC; 00425 } 00426 } 00427 int EstComs = 2; 00428 double MaxL = TFlt::Mn; 00429 if (UseBIC) { 00430 printf("Number of communities vs likelihood (criterion: BIC)\n"); 00431 } else { 00432 printf("Number of communities vs likelihood (criterion: Cross validation)\n"); 00433 } 00434 for (int c = 0; c < ComsV.Len(); c++) { 00435 ComsLV.Add(TIntFltPr(ComsV[c].Val, HOLV[c].Val)); 00436 printf("%d(%f)\t", ComsV[c].Val, HOLV[c].Val); 00437 if (MaxL < HOLV[c]) { 00438 MaxL = HOLV[c]; 00439 EstComs = ComsV[c]; 00440 } 00441 } 00442 printf("\n"); 00443 RandomInit(EstComs); 00444 HOVIDSV.Gen(G->GetNodes()); 00445 HOKIDSV.Gen(G->GetNodes()); 00446 if (! PlotLFNm.Empty()) { 00447 TGnuPlot::PlotValV(ComsLV, PlotLFNm, "hold-out likelihood", "communities", "likelihood"); 00448 } 00449 return EstComs; 00450 } 00451 00452 double TCesna::LikelihoodHoldOut() { 00453 double L = 0.0; 00454 for (int u = 0; u < HOVIDSV.Len(); u++) { 00455 for (int e = 0; e < HOVIDSV[u].Len(); e++) { 00456 int VID = HOVIDSV[u][e]; 00457 if (VID == u) { continue; } 00458 double Pred = Prediction(u, VID); 00459 if (G->IsEdge(u, VID)) { 00460 L += log(1.0 - Pred); 00461 } 00462 else { 00463 L += NegWgt * log(Pred); 00464 } 00465 //printf("NODE %d, %d: Dot Prod: %f, Prediction: %f L:%f\n", u, VID, DotProduct(F[u], F[VID]), Pred, L); 00466 } 00467 } 00468 L *= (1.0 - WeightAttr); 00469 //printf("likelihood for hold out network:%f\n", L); 00470 for (int u = 0; u < HOKIDSV.Len(); u++) { 00471 for (int e = 0; e < HOKIDSV[u].Len(); e++) { 00472 IAssert(HOKIDSV[u][e] < Attrs); 00473 L += WeightAttr * LikelihoodAttrKForRow(u, HOKIDSV[u][e]); 00474 //printf("asbefsafd ATTRIBUTE %d, NODE %d:%f\n", u, HOKIDSV[u][e].Val, LikelihoodAttrKForRow(u, HOKIDSV[u][e])); 00475 } 00476 } 00477 return L; 00478 } 00479 00480 double TCesna::GetStepSizeByLineSearch(const int UID, const TIntFltH& DeltaV, const TIntFltH& GradV, const double& Alpha, const double& Beta, const int MaxIter) { 00481 double StepSize = 1.0; 00482 double InitLikelihood = LikelihoodForRow(UID); 00483 TIntFltH NewVarV(DeltaV.Len()); 00484 for(int iter = 0; iter < MaxIter; iter++) { 00485 for (int i = 0; i < DeltaV.Len(); i++){ 00486 int CID = DeltaV.GetKey(i); 00487 double NewVal = GetCom(UID, CID) + StepSize * DeltaV.GetDat(CID); 00488 if (NewVal < MinVal) { NewVal = MinVal; } 00489 if (NewVal > MaxVal) { NewVal = MaxVal; } 00490 NewVarV.AddDat(CID, NewVal); 00491 } 00492 if (LikelihoodForRow(UID, NewVarV) < InitLikelihood + Alpha * StepSize * DotProduct(GradV, DeltaV)) { 00493 StepSize *= Beta; 00494 } else { 00495 break; 00496 } 00497 if (iter == MaxIter - 1) { 00498 StepSize = 0.0; 00499 break; 00500 } 00501 } 00502 return StepSize; 00503 } 00504 00505 int TCesna::MLEGradAscent(const double& Thres, const int& MaxIter, const TStr PlotNm, const double StepAlpha, const double StepBeta) { 00506 time_t InitTime = time(NULL); 00507 TExeTm ExeTm, CheckTm; 00508 int iter = 0, PrevIter = 0; 00509 TIntFltPrV IterLV; 00510 TUNGraph::TNodeI UI; 00511 double PrevL = TFlt::Mn, CurL = 0.0; 00512 TIntV NIdxV(F.Len(), 0); 00513 for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); } 00514 TIntFltH GradV; 00515 //consider all C 00516 TIntSet CIDSet(NumComs); 00517 for (int c = 0; c < NumComs; c++) { CIDSet.AddKey(c); } 00518 while(iter < MaxIter) { 00519 NIdxV.Shuffle(Rnd); 00520 for (int ui = 0; ui < F.Len(); ui++, iter++) { 00521 int u = NIdxV[ui]; // 00522 /* 00523 //find set of candidate c (we only need to consider c to which a neighbor of u belongs to) 00524 UI = G->GetNI(u); 00525 TIntSet CIDSet(20 * UI.GetDeg()); 00526 for (int e = 0; e < UI.GetDeg(); e++) { 00527 if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; } 00528 TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)]; 00529 for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) { 00530 CIDSet.AddKey(CI.GetKey()); 00531 } 00532 } 00533 for (TIntFltH::TIter CI = F[u].BegI(); CI < F[u].EndI(); CI++) { //remove the community membership which U does not share with its neighbors 00534 if (! CIDSet.IsKey(CI.GetKey())) { 00535 DelCom(u, CI.GetKey()); 00536 } 00537 } 00538 if (CIDSet.Empty()) { continue; } 00539 */ 00540 00541 GradientForRow(u, GradV, CIDSet); 00542 if (Norm2(GradV) < 1e-4) { continue; } 00543 double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta); 00544 if (LearnRate == 0.0) { continue; } 00545 for (int ci = 0; ci < GradV.Len(); ci++) { 00546 int CID = GradV.GetKey(ci); 00547 double Change = LearnRate * GradV.GetDat(CID); 00548 double NewFuc = GetCom(u, CID) + Change; 00549 if (NewFuc <= 0.0) { 00550 DelCom(u, CID); 00551 } else { 00552 AddCom(u, CID, NewFuc); 00553 } 00554 } 00555 if (! PlotNm.Empty() && (iter + 1) % G->GetNodes() == 0) { 00556 IterLV.Add(TIntFltPr(iter, Likelihood(false))); 00557 } 00558 } 00559 // fit W (logistic regression) 00560 for (int k = 0; k < Attrs; k++) { 00561 TFltV GradWV(NumComs); 00562 GradientForWK(GradWV, k); 00563 if (TLinAlg::Norm2(GradWV) < 1e-4) { continue; } 00564 double LearnRate = GetStepSizeByLineSearchForWK(k, GradWV, GradWV, StepAlpha, StepBeta); 00565 if (LearnRate == 0.0) { continue; } 00566 for (int c = 0; c < GradWV.Len(); c++){ 00567 W[k][c] += LearnRate * GradWV[c]; 00568 if (W[k][c] < MinValW) { W[k][c] = MinValW; } 00569 if (W[k][c] > MaxValW) { W[k][c] = MaxValW; } 00570 } 00571 } 00572 printf("\r%d iterations (%f) [%lu sec]", iter, CurL, time(NULL) - InitTime); 00573 fflush(stdout); 00574 if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) { 00575 PrevIter = iter; 00576 CurL = Likelihood(); 00577 if (PrevL > TFlt::Mn && ! PlotNm.Empty()) { 00578 printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL, CurL - PrevL); 00579 } 00580 fflush(stdout); 00581 if (CurL - PrevL <= Thres * fabs(PrevL)) { break; } 00582 else { PrevL = CurL; } 00583 } 00584 00585 } 00586 printf("\n"); 00587 printf("MLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr()); 00588 if (! PlotNm.Empty()) { 00589 TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q"); 00590 } 00591 return iter; 00592 } 00593 00594 int TCesna::MLEGradAscentParallel(const double& Thres, const int& MaxIter, const int ChunkNum, const int ChunkSize, const TStr PlotNm, const double StepAlpha, const double StepBeta) { 00595 //parallel 00596 time_t InitTime = time(NULL); 00597 uint64 StartTm = TSecTm::GetCurTm().GetAbsSecs(); 00598 TExeTm ExeTm, CheckTm; 00599 double PrevL = Likelihood(true); 00600 TIntFltPrV IterLV; 00601 int PrevIter = 0; 00602 int iter = 0; 00603 TIntV NIdxV(F.Len(), 0); 00604 for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); } 00605 TIntV NIDOPTV(F.Len()); //check if a node needs optimization or not 1: does not require optimization 00606 NIDOPTV.PutAll(0); 00607 TVec<TIntFltH> NewF(ChunkNum * ChunkSize); 00608 TIntV NewNIDV(ChunkNum * ChunkSize); 00609 for (iter = 0; iter < MaxIter; iter++) { 00610 NIdxV.Clr(false); 00611 for (int i = 0; i < F.Len(); i++) { 00612 if (NIDOPTV[i] == 0) { NIdxV.Add(i); } 00613 } 00614 IAssert (NIdxV.Len() <= F.Len()); 00615 NIdxV.Shuffle(Rnd); 00616 // compute gradient for chunk of nodes 00617 #pragma omp parallel for schedule(static, 1) 00618 for (int TIdx = 0; TIdx < ChunkNum; TIdx++) { 00619 TIntFltH GradV; 00620 for (int ui = TIdx * ChunkSize; ui < (TIdx + 1) * ChunkSize; ui++) { 00621 NewNIDV[ui] = -1; 00622 if (ui >= NIdxV.Len()) { continue; } 00623 int u = NIdxV[ui]; // 00624 //find set of candidate c (we only need to consider c to which a neighbor of u belongs to) 00625 TUNGraph::TNodeI UI = G->GetNI(u); 00626 TIntSet CIDSet(5 * UI.GetDeg()); 00627 TIntFltH CurFU = F[u]; 00628 for (int e = 0; e < UI.GetDeg(); e++) { 00629 if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; } 00630 TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)]; 00631 for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) { 00632 CIDSet.AddKey(CI.GetKey()); 00633 } 00634 } 00635 if (CIDSet.Empty()) { 00636 CurFU.Clr(); 00637 } 00638 else { 00639 for (TIntFltH::TIter CI = CurFU.BegI(); CI < CurFU.EndI(); CI++) { //remove the community membership which U does not share with its neighbors 00640 if (! CIDSet.IsKey(CI.GetKey())) { 00641 CurFU.DelIfKey(CI.GetKey()); 00642 } 00643 } 00644 GradientForRow(u, GradV, CIDSet); 00645 if (Norm2(GradV) < 1e-4) { NIDOPTV[u] = 1; continue; } 00646 double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta); 00647 if (LearnRate == 0.0) { NewNIDV[ui] = -2; continue; } 00648 for (int ci = 0; ci < GradV.Len(); ci++) { 00649 int CID = GradV.GetKey(ci); 00650 double Change = LearnRate * GradV.GetDat(CID); 00651 double NewFuc = CurFU.IsKey(CID)? CurFU.GetDat(CID) + Change : Change; 00652 if (NewFuc <= 0.0) { 00653 CurFU.DelIfKey(CID); 00654 } else { 00655 CurFU.AddDat(CID) = NewFuc; 00656 } 00657 } 00658 CurFU.Defrag(); 00659 } 00660 //store changes 00661 NewF[ui] = CurFU; 00662 NewNIDV[ui] = u; 00663 } 00664 } 00665 int NumNoChangeGrad = 0; 00666 int NumNoChangeStepSize = 0; 00667 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 00668 int NewNID = NewNIDV[ui]; 00669 if (NewNID == -1) { NumNoChangeGrad++; continue; } 00670 if (NewNID == -2) { NumNoChangeStepSize++; continue; } 00671 for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) { 00672 SumFV[CI.GetKey()] -= CI.GetDat(); 00673 } 00674 } 00675 #pragma omp parallel for 00676 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 00677 int NewNID = NewNIDV[ui]; 00678 if (NewNID < 0) { continue; } 00679 F[NewNID] = NewF[ui]; 00680 } 00681 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 00682 int NewNID = NewNIDV[ui]; 00683 if (NewNID < 0) { continue; } 00684 for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) { 00685 SumFV[CI.GetKey()] += CI.GetDat(); 00686 } 00687 } 00688 // update the nodes who are optimal 00689 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 00690 int NewNID = NewNIDV[ui]; 00691 if (NewNID < 0) { continue; } 00692 TUNGraph::TNodeI UI = G->GetNI(NewNID); 00693 NIDOPTV[NewNID] = 0; 00694 for (int e = 0; e < UI.GetDeg(); e++) { 00695 NIDOPTV[UI.GetNbrNId(e)] = 0; 00696 } 00697 } 00698 int OPTCnt = 0; 00699 for (int i = 0; i < NIDOPTV.Len(); i++) { if (NIDOPTV[i] == 1) { OPTCnt++; } } 00700 // fit W (logistic regression) 00701 if (! PlotNm.Empty()) { 00702 printf("\r%d iterations [%s] %s secs", iter * ChunkSize * ChunkNum, ExeTm.GetTmStr(), TUInt64::GetStr(TSecTm::GetCurTm().GetAbsSecs()-StartTm).CStr()); 00703 if (PrevL > TFlt::Mn) { printf(" (%f) %d g %d s %d OPT", PrevL, NumNoChangeGrad, NumNoChangeStepSize, OPTCnt); } 00704 fflush(stdout); 00705 } 00706 if (iter == 0 || (iter - PrevIter) * ChunkSize * ChunkNum >= G->GetNodes()) { 00707 #pragma omp parallel for 00708 for (int k = 0; k < Attrs; k++) { 00709 TFltV GradWV(NumComs); 00710 GradientForWK(GradWV, k); 00711 if (TLinAlg::Norm2(GradWV) < 1e-4) { continue; } 00712 double LearnRate = GetStepSizeByLineSearchForWK(k, GradWV, GradWV, StepAlpha, StepBeta); 00713 if (LearnRate == 0.0) { continue; } 00714 for (int c = 0; c < GradWV.Len(); c++){ 00715 W[k][c] += LearnRate * GradWV[c]; 00716 if (W[k][c] < MinValW) { W[k][c] = MinValW; } 00717 if (W[k][c] > MaxValW) { W[k][c] = MaxValW; } 00718 } 00719 } 00720 PrevIter = iter; 00721 double CurL = Likelihood(true); 00722 IterLV.Add(TIntFltPr(iter * ChunkSize * ChunkNum, CurL)); 00723 printf("\r%d iterations, Likelihood: %f, Diff: %f [%lu secs]", iter, CurL, CurL - PrevL, time(NULL) - InitTime); 00724 fflush(stdout); 00725 if (CurL - PrevL <= Thres * fabs(PrevL)) { 00726 break; 00727 } 00728 else { 00729 PrevL = CurL; 00730 } 00731 } 00732 } 00733 if (! PlotNm.Empty()) { 00734 printf("\nMLE completed with %d iterations(%s secs)\n", iter, TUInt64::GetStr(TSecTm::GetCurTm().GetAbsSecs()-StartTm).CStr()); 00735 TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q"); 00736 } else { 00737 printf("\rMLE completed with %d iterations(%lu secs)", iter, time(NULL) - InitTime); 00738 fflush(stdout); 00739 } 00740 return iter; 00741 }