SNAP Library 2.0, User Reference
2013-05-13 16:33:57
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
00001 00002 #include "stdafx.h" 00003 #include "agmfast.h" 00004 #include "Snap.h" 00005 #include "agm.h" 00006 00007 void TAGMFast::Save(TSOut& SOut) { 00008 G->Save(SOut); 00009 F.Save(SOut); 00010 NIDV.Save(SOut); 00011 RegCoef.Save(SOut); 00012 SumFV.Save(SOut); 00013 NodesOk.Save(SOut); 00014 MinVal.Save(SOut); 00015 MaxVal.Save(SOut); 00016 NegWgt.Save(SOut); 00017 NumComs.Save(SOut); 00018 HOVIDSV.Save(SOut); 00019 PNoCom.Save(SOut); 00020 } 00021 00022 void TAGMFast::Load(TSIn& SIn, const int& RndSeed) { 00023 G->Load(SIn); 00024 F.Load(SIn); 00025 NIDV.Load(SIn); 00026 RegCoef.Load(SIn); 00027 SumFV.Load(SIn); 00028 NodesOk.Load(SIn); 00029 MinVal.Load(SIn); 00030 MaxVal.Load(SIn); 00031 NegWgt.Load(SIn); 00032 NumComs.Load(SIn); 00033 HOVIDSV.Load(SIn); 00034 PNoCom.Load(SIn); 00035 Rnd.PutSeed(RndSeed); 00036 } 00037 00038 void TAGMFast::RandomInit(const int InitComs) { 00039 F.Gen(G->GetNodes()); 00040 SumFV.Gen(InitComs); 00041 NumComs = InitComs; 00042 for (int u = 0; u < F.Len(); u++) { 00043 //assign to just one community 00044 int Mem = G->GetNI(u).GetDeg(); 00045 if (Mem > 10) { Mem = 10; } 00046 for (int c = 0; c < Mem; c++) { 00047 int CID = Rnd.GetUniDevInt(InitComs); 00048 AddCom(u, CID, Rnd.GetUniDev()); 00049 } 00050 } 00051 //assign a member to zero-member community (if any) 00052 for (int c = 0; c < SumFV.Len(); c++) { 00053 if (SumFV[c] == 0.0) { 00054 int UID = Rnd.GetUniDevInt(G->GetNodes()); 00055 AddCom(UID, c, Rnd.GetUniDev()); 00056 } 00057 } 00058 } 00059 00060 void TAGMFast::NeighborComInit(const int InitComs) { 00061 //initialize with best neighborhood communities (Gleich et.al. KDD'12) 00062 F.Gen(G->GetNodes()); 00063 SumFV.Gen(InitComs); 00064 NumComs = InitComs; 00065 const int Edges = G->GetEdges(); 00066 //TIntFltH NCPhiH(F.Len()); 00067 TFltIntPrV NIdPhiV(F.Len(), 0); 00068 TIntSet InvalidNIDS(F.Len()); 00069 TIntV ChosenNIDV(InitComs, 0); //FOR DEBUG 00070 TExeTm RunTm; 00071 //compute conductance of neighborhood community 00072 for (int u = 0; u < F.Len(); u++) { 00073 TIntSet NBCmty(G->GetNI(u).GetDeg() + 1); 00074 double Phi; 00075 if (G->GetNI(u).GetDeg() < 5) { //do not include nodes with too few degree 00076 Phi = 1.0; 00077 } else { 00078 TAGMUtil::GetNbhCom(G, u, NBCmty); 00079 IAssert(NBCmty.Len() == G->GetNI(u).GetDeg() + 1); 00080 Phi = TAGMUtil::GetConductance(G, NBCmty, Edges); 00081 } 00082 //NCPhiH.AddDat(u, Phi); 00083 NIdPhiV.Add(TFltIntPr(Phi, u)); 00084 } 00085 NIdPhiV.Sort(true); 00086 printf("conductance computation completed [%s]\n", RunTm.GetTmStr()); 00087 fflush(stdout); 00088 //choose nodes with local minimum in conductance 00089 int CurCID = 0; 00090 for (int ui = 0; ui < NIdPhiV.Len(); ui++) { 00091 int UID = NIdPhiV[ui].Val2; 00092 fflush(stdout); 00093 if (InvalidNIDS.IsKey(UID)) { continue; } 00094 ChosenNIDV.Add(UID); //FOR DEBUG 00095 //add the node and its neighbors to the current community 00096 AddCom(UID, CurCID, 1.0); 00097 TUNGraph::TNodeI NI = G->GetNI(UID); 00098 fflush(stdout); 00099 for (int e = 0; e < NI.GetDeg(); e++) { 00100 AddCom(NI.GetNbrNId(e), CurCID, 1.0); 00101 } 00102 //exclude its neighbors from the next considerations 00103 for (int e = 0; e < NI.GetDeg(); e++) { 00104 InvalidNIDS.AddKey(NI.GetNbrNId(e)); 00105 } 00106 CurCID++; 00107 fflush(stdout); 00108 if (CurCID >= NumComs) { break; } 00109 } 00110 if (NumComs > CurCID) { 00111 printf("%d communities needed to fill randomly\n", NumComs - CurCID); 00112 } 00113 //assign a member to zero-member community (if any) 00114 for (int c = 0; c < SumFV.Len(); c++) { 00115 if (SumFV[c] == 0.0) { 00116 int ComSz = 10; 00117 for (int u = 0; u < ComSz; u++) { 00118 int UID = Rnd.GetUniDevInt(G->GetNodes()); 00119 AddCom(UID, c, Rnd.GetUniDev()); 00120 } 00121 } 00122 } 00123 } 00124 00125 void TAGMFast::SetCmtyVV(const TVec<TIntV>& CmtyVV) { 00126 F.Gen(G->GetNodes()); 00127 SumFV.Gen(CmtyVV.Len()); 00128 NumComs = CmtyVV.Len(); 00129 TIntH NIDIdxH(NIDV.Len()); 00130 if (! NodesOk) { 00131 for (int u = 0; u < NIDV.Len(); u++) { 00132 NIDIdxH.AddDat(NIDV[u], u); 00133 } 00134 } 00135 for (int c = 0; c < CmtyVV.Len(); c++) { 00136 for (int u = 0; u < CmtyVV[c].Len(); u++) { 00137 int UID = CmtyVV[c][u]; 00138 if (! NodesOk) { UID = NIDIdxH.GetDat(UID); } 00139 if (G->IsNode(UID)) { 00140 AddCom(UID, c, 1.0); 00141 } 00142 } 00143 } 00144 } 00145 00146 void TAGMFast::SetGraph(const PUNGraph& GraphPt) { 00147 G = GraphPt; 00148 HOVIDSV.Gen(G->GetNodes()); 00149 NodesOk = true; 00150 GraphPt->GetNIdV(NIDV); 00151 // check that nodes IDs are {0,1,..,Nodes-1} 00152 for (int nid = 0; nid < GraphPt->GetNodes(); nid++) { 00153 if (! GraphPt->IsNode(nid)) { 00154 NodesOk = false; 00155 break; 00156 } 00157 } 00158 if (! NodesOk) { 00159 printf("rearrage nodes\n"); 00160 G = TSnap::GetSubGraph(GraphPt, NIDV, true); 00161 for (int nid = 0; nid < G->GetNodes(); nid++) { 00162 IAssert(G->IsNode(nid)); 00163 } 00164 } 00165 TSnap::DelSelfEdges(G); 00166 00167 PNoCom = 1.0 / (double) G->GetNodes(); 00168 DoParallel = false; 00169 if (1.0 / PNoCom > sqrt(TFlt::Mx)) { PNoCom = 0.99 / sqrt(TFlt::Mx); } // to prevent overflow 00170 NegWgt = 1.0; 00171 } 00172 00173 double TAGMFast::Likelihood(const bool _DoParallel) { 00174 TExeTm ExeTm; 00175 double L = 0.0; 00176 if (_DoParallel) { 00177 #pragma omp parallel for 00178 for (int u = 0; u < F.Len(); u++) { 00179 double LU = LikelihoodForRow(u); 00180 #pragma omp atomic 00181 L += LU; 00182 } 00183 } 00184 else { 00185 for (int u = 0; u < F.Len(); u++) { 00186 double LU = LikelihoodForRow(u); 00187 L += LU; 00188 } 00189 } 00190 return L; 00191 } 00192 00193 double TAGMFast::LikelihoodForRow(const int UID) { 00194 return LikelihoodForRow(UID, F[UID]); 00195 } 00196 00197 00198 double TAGMFast::LikelihoodForRow(const int UID, const TIntFltH& FU) { 00199 double L = 0.0; 00200 TFltV HOSumFV; //adjust for Fv of v hold out 00201 if (HOVIDSV[UID].Len() > 0) { 00202 HOSumFV.Gen(SumFV.Len()); 00203 00204 for (int e = 0; e < HOVIDSV[UID].Len(); e++) { 00205 for (int c = 0; c < SumFV.Len(); c++) { 00206 HOSumFV[c] += GetCom(HOVIDSV[UID][e], c); 00207 } 00208 } 00209 } 00210 00211 TUNGraph::TNodeI NI = G->GetNI(UID); 00212 if (DoParallel && NI.GetDeg() > 10) { 00213 #pragma omp parallel for schedule(static, 1) 00214 for (int e = 0; e < NI.GetDeg(); e++) { 00215 int v = NI.GetNbrNId(e); 00216 if (v == UID) { continue; } 00217 if (HOVIDSV[UID].IsKey(v)) { continue; } 00218 double LU = log (1.0 - Prediction(FU, F[v])) + NegWgt * DotProduct(FU, F[v]); 00219 #pragma omp atomic 00220 L += LU; 00221 } 00222 for (TIntFltH::TIter HI = FU.BegI(); HI < FU.EndI(); HI++) { 00223 double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[HI.GetKey()].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist 00224 double LU = NegWgt * (SumFV[HI.GetKey()] - HOSum - GetCom(UID, HI.GetKey())) * HI.GetDat(); 00225 L -= LU; 00226 } 00227 } else { 00228 for (int e = 0; e < NI.GetDeg(); e++) { 00229 int v = NI.GetNbrNId(e); 00230 if (v == UID) { continue; } 00231 if (HOVIDSV[UID].IsKey(v)) { continue; } 00232 L += log (1.0 - Prediction(FU, F[v])) + NegWgt * DotProduct(FU, F[v]); 00233 } 00234 for (TIntFltH::TIter HI = FU.BegI(); HI < FU.EndI(); HI++) { 00235 double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[HI.GetKey()].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist 00236 L -= NegWgt * (SumFV[HI.GetKey()] - HOSum - GetCom(UID, HI.GetKey())) * HI.GetDat(); 00237 } 00238 } 00239 //add regularization 00240 if (RegCoef > 0.0) { //L1 00241 L -= RegCoef * Sum(FU); 00242 } 00243 if (RegCoef < 0.0) { //L2 00244 L += RegCoef * Norm2(FU); 00245 } 00246 00247 return L; 00248 } 00249 00250 void TAGMFast::GradientForRow(const int UID, TIntFltH& GradU, const TIntSet& CIDSet) { 00251 GradU.Gen(CIDSet.Len()); 00252 00253 TFltV HOSumFV; //adjust for Fv of v hold out 00254 if (HOVIDSV[UID].Len() > 0) { 00255 HOSumFV.Gen(SumFV.Len()); 00256 00257 for (int e = 0; e < HOVIDSV[UID].Len(); e++) { 00258 for (int c = 0; c < SumFV.Len(); c++) { 00259 HOSumFV[c] += GetCom(HOVIDSV[UID][e], c); 00260 } 00261 } 00262 } 00263 00264 TUNGraph::TNodeI NI = G->GetNI(UID); 00265 int Deg = NI.GetDeg(); 00266 TFltV PredV(Deg), GradV(CIDSet.Len()); 00267 TIntV CIDV(CIDSet.Len()); 00268 if (DoParallel && Deg + CIDSet.Len() > 10) { 00269 #pragma omp parallel for schedule(static, 1) 00270 for (int e = 0; e < Deg; e++) { 00271 if (NI.GetNbrNId(e) == UID) { continue; } 00272 if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; } 00273 PredV[e] = Prediction(UID, NI.GetNbrNId(e)); 00274 } 00275 00276 #pragma omp parallel for schedule(static, 1) 00277 for (int c = 0; c < CIDSet.Len(); c++) { 00278 int CID = CIDSet.GetKey(c); 00279 double Val = 0.0; 00280 for (int e = 0; e < Deg; e++) { 00281 int VID = NI.GetNbrNId(e); 00282 if (VID == UID) { continue; } 00283 if (HOVIDSV[UID].IsKey(VID)) { continue; } 00284 Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID); 00285 } 00286 double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist 00287 Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID)); 00288 CIDV[c] = CID; 00289 GradV[c] = Val; 00290 } 00291 } 00292 else { 00293 for (int e = 0; e < Deg; e++) { 00294 if (NI.GetNbrNId(e) == UID) { continue; } 00295 if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; } 00296 PredV[e] = Prediction(UID, NI.GetNbrNId(e)); 00297 } 00298 00299 for (int c = 0; c < CIDSet.Len(); c++) { 00300 int CID = CIDSet.GetKey(c); 00301 double Val = 0.0; 00302 for (int e = 0; e < Deg; e++) { 00303 int VID = NI.GetNbrNId(e); 00304 if (VID == UID) { continue; } 00305 if (HOVIDSV[UID].IsKey(VID)) { continue; } 00306 Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID); 00307 } 00308 double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist 00309 Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID)); 00310 CIDV[c] = CID; 00311 GradV[c] = Val; 00312 } 00313 } 00314 //add regularization 00315 if (RegCoef > 0.0) { //L1 00316 for (int c = 0; c < GradV.Len(); c++) { 00317 GradV[c] -= RegCoef; 00318 } 00319 } 00320 if (RegCoef < 0.0) { //L2 00321 for (int c = 0; c < GradV.Len(); c++) { 00322 GradV[c] += 2 * RegCoef * GetCom(UID, CIDV[c]); 00323 } 00324 } 00325 00326 00327 for (int c = 0; c < GradV.Len(); c++) { 00328 if (GetCom(UID, CIDV[c]) == 0.0 && GradV[c] < 0.0) { continue; } 00329 if (fabs(GradV[c]) < 0.0001) { continue; } 00330 GradU.AddDat(CIDV[c], GradV[c]); 00331 } 00332 for (int c = 0; c < GradU.Len(); c++) { 00333 if (GradU[c] >= 10) { GradU[c] = 10; } 00334 if (GradU[c] <= -10) { GradU[c] = -10; } 00335 IAssert(GradU[c] >= -10); 00336 } 00337 } 00338 00339 double TAGMFast::GradientForOneVar(const TFltV& AlphaKV, const int UID, const int CID, const double& Val) { 00340 TUNGraph::TNodeI UI = G->GetNI(UID); 00341 double Grad = 0.0, PNoEdge; 00342 int VID = 0; 00343 for (int e = 0; e < UI.GetDeg(); e++) { 00344 VID = UI.GetNbrNId(e); 00345 if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } 00346 if (! F[VID].IsKey(CID)) { continue; } 00347 PNoEdge = AlphaKV[e] * exp (- F[VID].GetDat(CID) * Val); 00348 IAssert(PNoEdge <= 1.0 && PNoEdge >= 0.0); 00349 //PNoEdge = PNoEdge >= 1.0 - PNoCom? 1 - PNoCom: PNoEdge; 00350 Grad += ((PNoEdge * F[VID].GetDat(CID)) / (1.0 - PNoEdge) + NegWgt * F[VID].GetDat(CID)); 00351 } 00352 Grad -= NegWgt * (SumFV[CID] - GetCom(UID, CID)); 00353 //add regularization 00354 if (RegCoef > 0.0) { //L1 00355 Grad -= RegCoef; 00356 } 00357 if (RegCoef < 0.0) { //L2 00358 Grad += 2 * RegCoef * Val; 00359 } 00360 00361 return Grad; 00362 } 00363 00364 double TAGMFast::LikelihoodForOneVar(const TFltV& AlphaKV, const int UID, const int CID, const double& Val) { 00365 TUNGraph::TNodeI UI = G->GetNI(UID); 00366 double L = 0.0, PNoEdge; 00367 int VID = 0; 00368 for (int e = 0; e < UI.GetDeg(); e++) { 00369 VID = UI.GetNbrNId(e); 00370 if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } 00371 if (! F[VID].IsKey(CID)) { 00372 PNoEdge = AlphaKV[e]; 00373 } else { 00374 PNoEdge = AlphaKV[e] * exp (- F[VID].GetDat(CID) * Val); 00375 } 00376 IAssert(PNoEdge <= 1.0 && PNoEdge >= 0.0); 00377 //PNoEdge = PNoEdge >= 1.0 - PNoCom? 1 - PNoCom: PNoEdge; 00378 L += log(1.0 - PNoEdge) + NegWgt * GetCom(VID, CID) * Val; 00379 00380 // += ((PNoEdge * F[VID].GetDat(CID)) / (1.0 - PNoEdge) + NegWgt * F[VID].GetDat(CID)); 00381 } 00382 L -= NegWgt * (SumFV[CID] - GetCom(UID, CID)) * Val; 00383 //add regularization 00384 if (RegCoef > 0.0) { //L1 00385 L -= RegCoef * Val; 00386 } 00387 if (RegCoef < 0.0) { //L2 00388 L += RegCoef * Val * Val; 00389 } 00390 00391 return L; 00392 } 00393 00394 double TAGMFast::HessianForOneVar(const TFltV& AlphaKV, const int UID, const int CID, const double& Val) { 00395 TUNGraph::TNodeI UI = G->GetNI(UID); 00396 double H = 0.0, PNoEdge; 00397 int VID = 0; 00398 for (int e = 0; e < UI.GetDeg(); e++) { 00399 VID = UI.GetNbrNId(e); 00400 if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } 00401 if (! F[VID].IsKey(CID)) { continue; } 00402 PNoEdge = AlphaKV[e] * exp (- F[VID].GetDat(CID) * Val); 00403 IAssert(PNoEdge <= 1.0 && PNoEdge >= 0.0); 00404 //PNoEdge = PNoEdge == 1.0? 1 - PNoCom: PNoEdge; 00405 H += (- PNoEdge * F[VID].GetDat(CID) * F[VID].GetDat(CID)) / (1.0 - PNoEdge) / (1.0 - PNoEdge); 00406 } 00407 //add regularization 00408 if (RegCoef < 0.0) { //L2 00409 H += 2 * RegCoef; 00410 } 00411 IAssert (H <= 0.0); 00412 return H; 00413 } 00414 00416 int TAGMFast::MLENewton(const double& Thres, const int& MaxIter, const TStr PlotNm) { 00417 TExeTm ExeTm; 00418 int iter = 0, PrevIter = 0; 00419 TIntFltPrV IterLV; 00420 double PrevL = TFlt::Mn, CurL; 00421 TUNGraph::TNodeI UI; 00422 TIntV NIdxV; 00423 G->GetNIdV(NIdxV); 00424 int CID, UID, NewtonIter; 00425 double Fuc, PrevFuc, Grad, H; 00426 while(iter < MaxIter) { 00427 NIdxV.Shuffle(Rnd); 00428 for (int ui = 0; ui < F.Len(); ui++, iter++) { 00429 if (! PlotNm.Empty() && iter % G->GetNodes() == 0) { 00430 IterLV.Add(TIntFltPr(iter, Likelihood(false))); 00431 } 00432 UID = NIdxV[ui]; 00433 //find set of candidate c (we only need to consider c to which a neighbor of u belongs to) 00434 TIntSet CIDSet; 00435 UI = G->GetNI(UID); 00436 if (UI.GetDeg() == 0) { //if the node is isolated, clear its membership and skip 00437 if (! F[UID].Empty()) { F[UID].Clr(); } 00438 continue; 00439 } 00440 for (int e = 0; e < UI.GetDeg(); e++) { 00441 if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } 00442 TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)]; 00443 for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) { 00444 CIDSet.AddKey(CI.GetKey()); 00445 } 00446 } 00447 for (TIntFltH::TIter CI = F[UID].BegI(); CI < F[UID].EndI(); CI++) { //remove the community membership which U does not share with its neighbors 00448 if (! CIDSet.IsKey(CI.GetKey())) { 00449 DelCom(UID, CI.GetKey()); 00450 } 00451 } 00452 if (CIDSet.Empty()) { continue; } 00453 for (TIntSet::TIter CI = CIDSet.BegI(); CI < CIDSet.EndI(); CI++) { 00454 CID = CI.GetKey(); 00455 //optimize for UID, CID 00456 //compute constants 00457 TFltV AlphaKV(UI.GetDeg()); 00458 for (int e = 0; e < UI.GetDeg(); e++) { 00459 if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; } 00460 AlphaKV[e] = (1 - PNoCom) * exp(- DotProduct(UID, UI.GetNbrNId(e)) + GetCom(UI.GetNbrNId(e), CID) * GetCom(UID, CID)); 00461 IAssertR(AlphaKV[e] <= 1.0, TStr::Fmt("AlphaKV=%f, %f, %f", AlphaKV[e].Val, PNoCom.Val, GetCom(UI.GetNbrNId(e), CID))); 00462 } 00463 Fuc = GetCom(UID, CID); 00464 PrevFuc = Fuc; 00465 Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0; 00466 if (Grad <= 1e-3 && Grad >= -0.1) { continue; } 00467 NewtonIter = 0; 00468 while (NewtonIter++ < 10) { 00469 Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0; 00470 H = HessianForOneVar(AlphaKV, UID, CID, Fuc); 00471 if (Fuc == 0.0 && Grad <= 0.0) { Grad = 0.0; } 00472 if (fabs(Grad) < 1e-3) { break; } 00473 if (H == 0.0) { Fuc = 0.0; break; } 00474 double NewtonStep = - Grad / H; 00475 if (NewtonStep < -0.5) { NewtonStep = - 0.5; } 00476 Fuc += NewtonStep; 00477 if (Fuc < 0.0) { Fuc = 0.0; } 00478 } 00479 if (Fuc == 0.0) { 00480 DelCom(UID, CID); 00481 } 00482 else { 00483 AddCom(UID, CID, Fuc); 00484 } 00485 } 00486 } 00487 if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) { 00488 PrevIter = iter; 00489 CurL = Likelihood(); 00490 if (PrevL > TFlt::Mn && ! PlotNm.Empty()) { 00491 printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL, CurL - PrevL); 00492 } 00493 fflush(stdout); 00494 if (CurL - PrevL <= Thres * fabs(PrevL)) { break; } 00495 else { PrevL = CurL; } 00496 } 00497 00498 } 00499 if (! PlotNm.Empty()) { 00500 printf("\nMLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr()); 00501 TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q"); 00502 } 00503 return iter; 00504 } 00505 00506 void TAGMFast::GetCmtyVV(TVec<TIntV>& CmtyVV) { 00507 GetCmtyVV(CmtyVV, sqrt(2.0 * (double) G->GetEdges() / G->GetNodes() / G->GetNodes()), 3); 00508 } 00509 00511 void TAGMFast::GetCmtyVV(TVec<TIntV>& CmtyVV, const double Thres, const int MinSz) { 00512 CmtyVV.Gen(NumComs, 0); 00513 TIntFltH CIDSumFH(NumComs); 00514 for (int c = 0; c < SumFV.Len(); c++) { 00515 CIDSumFH.AddDat(c, SumFV[c]); 00516 } 00517 CIDSumFH.SortByDat(false); 00518 for (int c = 0; c < NumComs; c++) { 00519 int CID = CIDSumFH.GetKey(c); 00520 TIntFltH NIDFucH(F.Len() / 10); 00521 TIntV CmtyV; 00522 IAssert(SumFV[CID] == CIDSumFH.GetDat(CID)); 00523 if (SumFV[CID] < Thres) { continue; } 00524 for (int u = 0; u < F.Len(); u++) { 00525 int NID = u; 00526 if (! NodesOk) { NID = NIDV[u]; } 00527 if (GetCom(u, CID) >= Thres) { NIDFucH.AddDat(NID, GetCom(u, CID)); } 00528 } 00529 NIDFucH.SortByDat(false); 00530 NIDFucH.GetKeyV(CmtyV); 00531 if (CmtyV.Len() >= MinSz) { CmtyVV.Add(CmtyV); } 00532 } 00533 if ( NumComs != CmtyVV.Len()) { 00534 printf("Community vector generated. %d communities are ommitted\n", NumComs.Val - CmtyVV.Len()); 00535 } 00536 } 00537 00539 int TAGMFast::FindComsByCV(const int NumThreads, const int MaxComs, const int MinComs, const int DivComs, const TStr OutFNm, const double StepAlpha, const double StepBeta) { 00540 double ComsGap = exp(TMath::Log((double) MaxComs / (double) MinComs) / (double) DivComs); 00541 TIntV ComsV; 00542 ComsV.Add(MinComs); 00543 while (ComsV.Len() < DivComs) { 00544 int NewComs = int(ComsV.Last() * ComsGap); 00545 if (NewComs == ComsV.Last().Val) { NewComs++; } 00546 ComsV.Add(NewComs); 00547 } 00548 if (ComsV.Last() < MaxComs) { ComsV.Add(MaxComs); } 00549 return FindComsByCV(ComsV, 0.1, NumThreads, OutFNm + ".CV.likelihood", StepAlpha, StepBeta); 00550 } 00551 00552 int TAGMFast::FindComsByCV(TIntV& ComsV, const double HOFrac, const int NumThreads, const TStr PlotLFNm, const double StepAlpha, const double StepBeta) { 00553 if (ComsV.Len() == 0) { 00554 int MaxComs = G->GetNodes() / 5; 00555 ComsV.Add(2); 00556 while(ComsV.Last() < MaxComs) { ComsV.Add(ComsV.Last() * 2); } 00557 } 00558 TIntPrV EdgeV(G->GetEdges(), 0); 00559 for (TUNGraph::TEdgeI EI = G->BegEI(); EI < G->EndEI(); EI++) { 00560 EdgeV.Add(TIntPr(EI.GetSrcNId(), EI.GetDstNId())); 00561 } 00562 EdgeV.Shuffle(Rnd); 00563 int MaxIterCV = 3; 00564 00565 TVec<TVec<TIntSet> > HoldOutSets(MaxIterCV); 00566 if (EdgeV.Len() > 50) { //if edges are many enough, use CV 00567 printf("generating hold out set\n"); 00568 TIntV NIdV1, NIdV2; 00569 G->GetNIdV(NIdV1); 00570 G->GetNIdV(NIdV2); 00571 for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) { 00572 // generate holdout sets 00573 HoldOutSets[IterCV].Gen(G->GetNodes()); 00574 const int HOTotal = int(HOFrac * G->GetNodes() * (G->GetNodes() - 1) / 2.0); 00575 int HOCnt = 0; 00576 int HOEdges = (int) TMath::Round(HOFrac * G->GetEdges()); 00577 printf("holding out %d edges...\n", HOEdges); 00578 for (int he = 0; he < (int) HOEdges; he++) { 00579 HoldOutSets[IterCV][EdgeV[he].Val1].AddKey(EdgeV[he].Val2); 00580 HoldOutSets[IterCV][EdgeV[he].Val2].AddKey(EdgeV[he].Val1); 00581 HOCnt++; 00582 } 00583 printf("%d Edges hold out\n", HOCnt); 00584 while(HOCnt++ < HOTotal) { 00585 int SrcNID = Rnd.GetUniDevInt(G->GetNodes()); 00586 int DstNID = Rnd.GetUniDevInt(G->GetNodes()); 00587 HoldOutSets[IterCV][SrcNID].AddKey(DstNID); 00588 HoldOutSets[IterCV][DstNID].AddKey(SrcNID); 00589 } 00590 } 00591 printf("hold out set generated\n"); 00592 } 00593 00594 TFltV HOLV(ComsV.Len()); 00595 TIntFltPrV ComsLV; 00596 for (int c = 0; c < ComsV.Len(); c++) { 00597 const int Coms = ComsV[c]; 00598 printf("Try number of Coms:%d\n", Coms); 00599 NeighborComInit(Coms); 00600 printf("Initialized\n"); 00601 00602 if (EdgeV.Len() > 50) { //if edges are many enough, use CV 00603 for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) { 00604 HOVIDSV = HoldOutSets[IterCV]; 00605 00606 if (NumThreads == 1) { 00607 printf("MLE without parallelization begins\n"); 00608 MLEGradAscent(0.05, 10 * G->GetNodes(), "", StepAlpha, StepBeta); 00609 } else { 00610 printf("MLE with parallelization begins\n"); 00611 MLEGradAscentParallel(0.05, 100, NumThreads, "", StepAlpha, StepBeta); 00612 } 00613 double HOL = LikelihoodHoldOut(); 00614 HOL = HOL < 0? HOL: TFlt::Mn; 00615 HOLV[c] += HOL; 00616 } 00617 } 00618 else { 00619 HOVIDSV.Gen(G->GetNodes()); 00620 MLEGradAscent(0.0001, 100 * G->GetNodes(), ""); 00621 double BIC = 2 * Likelihood() - (double) G->GetNodes() * Coms * 2.0 * log ( (double) G->GetNodes()); 00622 HOLV[c] = BIC; 00623 } 00624 } 00625 int EstComs = 2; 00626 double MaxL = TFlt::Mn; 00627 printf("\n"); 00628 for (int c = 0; c < ComsV.Len(); c++) { 00629 ComsLV.Add(TIntFltPr(ComsV[c].Val, HOLV[c].Val)); 00630 printf("%d(%f)\t", ComsV[c].Val, HOLV[c].Val); 00631 if (MaxL < HOLV[c]) { 00632 MaxL = HOLV[c]; 00633 EstComs = ComsV[c]; 00634 } 00635 } 00636 printf("\n"); 00637 RandomInit(EstComs); 00638 HOVIDSV.Gen(G->GetNodes()); 00639 if (! PlotLFNm.Empty()) { 00640 TGnuPlot::PlotValV(ComsLV, PlotLFNm, "hold-out likelihood", "communities", "likelihood"); 00641 } 00642 return EstComs; 00643 } 00644 00645 double TAGMFast::LikelihoodHoldOut(const bool DoParallel) { 00646 double L = 0.0; 00647 for (int u = 0; u < HOVIDSV.Len(); u++) { 00648 for (int e = 0; e < HOVIDSV[u].Len(); e++) { 00649 int VID = HOVIDSV[u][e]; 00650 if (VID == u) { continue; } 00651 double Pred = Prediction(u, VID); 00652 if (G->IsEdge(u, VID)) { 00653 L += log(1.0 - Pred); 00654 } 00655 else { 00656 L += NegWgt * log(Pred); 00657 } 00658 } 00659 } 00660 return L; 00661 } 00662 00663 double TAGMFast::GetStepSizeByLineSearch(const int UID, const TIntFltH& DeltaV, const TIntFltH& GradV, const double& Alpha, const double& Beta, const int MaxIter) { 00664 double StepSize = 1.0; 00665 double InitLikelihood = LikelihoodForRow(UID); 00666 TIntFltH NewVarV(DeltaV.Len()); 00667 for(int iter = 0; iter < MaxIter; iter++) { 00668 for (int i = 0; i < DeltaV.Len(); i++){ 00669 int CID = DeltaV.GetKey(i); 00670 double NewVal = GetCom(UID, CID) + StepSize * DeltaV.GetDat(CID); 00671 if (NewVal < MinVal) { NewVal = MinVal; } 00672 if (NewVal > MaxVal) { NewVal = MaxVal; } 00673 NewVarV.AddDat(CID, NewVal); 00674 } 00675 if (LikelihoodForRow(UID, NewVarV) < InitLikelihood + Alpha * StepSize * DotProduct(GradV, DeltaV)) { 00676 StepSize *= Beta; 00677 } else { 00678 break; 00679 } 00680 if (iter == MaxIter - 1) { 00681 StepSize = 0.0; 00682 break; 00683 } 00684 } 00685 return StepSize; 00686 } 00687 00688 int TAGMFast::MLEGradAscent(const double& Thres, const int& MaxIter, const TStr PlotNm, const double StepAlpha, const double StepBeta) { 00689 time_t InitTime = time(NULL); 00690 TExeTm ExeTm, CheckTm; 00691 int iter = 0, PrevIter = 0; 00692 TIntFltPrV IterLV; 00693 TUNGraph::TNodeI UI; 00694 double PrevL = TFlt::Mn, CurL = 0.0; 00695 TIntV NIdxV(F.Len(), 0); 00696 for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); } 00697 IAssert(NIdxV.Len() == F.Len()); 00698 TIntFltH GradV; 00699 while(iter < MaxIter) { 00700 NIdxV.Shuffle(Rnd); 00701 for (int ui = 0; ui < F.Len(); ui++, iter++) { 00702 int u = NIdxV[ui]; // 00703 //find set of candidate c (we only need to consider c to which a neighbor of u belongs to) 00704 UI = G->GetNI(u); 00705 TIntSet CIDSet(5 * UI.GetDeg()); 00706 for (int e = 0; e < UI.GetDeg(); e++) { 00707 if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; } 00708 TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)]; 00709 for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) { 00710 CIDSet.AddKey(CI.GetKey()); 00711 } 00712 } 00713 for (TIntFltH::TIter CI = F[u].BegI(); CI < F[u].EndI(); CI++) { //remove the community membership which U does not share with its neighbors 00714 if (! CIDSet.IsKey(CI.GetKey())) { 00715 DelCom(u, CI.GetKey()); 00716 } 00717 } 00718 if (CIDSet.Empty()) { continue; } 00719 GradientForRow(u, GradV, CIDSet); 00720 if (Norm2(GradV) < 1e-4) { continue; } 00721 double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta); 00722 if (LearnRate == 0.0) { continue; } 00723 for (int ci = 0; ci < GradV.Len(); ci++) { 00724 int CID = GradV.GetKey(ci); 00725 double Change = LearnRate * GradV.GetDat(CID); 00726 double NewFuc = GetCom(u, CID) + Change; 00727 if (NewFuc <= 0.0) { 00728 DelCom(u, CID); 00729 } else { 00730 AddCom(u, CID, NewFuc); 00731 } 00732 } 00733 if (! PlotNm.Empty() && (iter + 1) % G->GetNodes() == 0) { 00734 IterLV.Add(TIntFltPr(iter, Likelihood(false))); 00735 } 00736 } 00737 printf("\r%d iterations (%f) [%lu sec]", iter, CurL, time(NULL) - InitTime); 00738 fflush(stdout); 00739 if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) { 00740 PrevIter = iter; 00741 CurL = Likelihood(); 00742 if (PrevL > TFlt::Mn && ! PlotNm.Empty()) { 00743 printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL, CurL - PrevL); 00744 } 00745 fflush(stdout); 00746 if (CurL - PrevL <= Thres * fabs(PrevL)) { break; } 00747 else { PrevL = CurL; } 00748 } 00749 00750 } 00751 printf("\n"); 00752 printf("MLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr()); 00753 if (! PlotNm.Empty()) { 00754 TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q"); 00755 } 00756 return iter; 00757 } 00758 00759 int TAGMFast::MLEGradAscentParallel(const double& Thres, const int& MaxIter, const int ChunkNum, const int ChunkSize, const TStr PlotNm, const double StepAlpha, const double StepBeta) { 00760 //parallel 00761 time_t InitTime = time(NULL); 00762 uint64 StartTm = TSecTm::GetCurTm().GetAbsSecs(); 00763 TExeTm ExeTm, CheckTm; 00764 double PrevL = Likelihood(true); 00765 TIntFltPrV IterLV; 00766 int PrevIter = 0; 00767 int iter = 0; 00768 TIntV NIdxV(F.Len(), 0); 00769 for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); } 00770 TIntV NIDOPTV(F.Len()); //check if a node needs optimization or not 1: does not require optimization 00771 NIDOPTV.PutAll(0); 00772 TVec<TIntFltH> NewF(ChunkNum * ChunkSize); 00773 TIntV NewNIDV(ChunkNum * ChunkSize); 00774 for (iter = 0; iter < MaxIter; iter++) { 00775 NIdxV.Clr(false); 00776 for (int i = 0; i < F.Len(); i++) { 00777 if (NIDOPTV[i] == 0) { NIdxV.Add(i); } 00778 } 00779 IAssert (NIdxV.Len() <= F.Len()); 00780 NIdxV.Shuffle(Rnd); 00781 // compute gradient for chunk of nodes 00782 #pragma omp parallel for schedule(static, 1) 00783 for (int TIdx = 0; TIdx < ChunkNum; TIdx++) { 00784 TIntFltH GradV; 00785 for (int ui = TIdx * ChunkSize; ui < (TIdx + 1) * ChunkSize; ui++) { 00786 NewNIDV[ui] = -1; 00787 if (ui > NIdxV.Len()) { continue; } 00788 int u = NIdxV[ui]; // 00789 //find set of candidate c (we only need to consider c to which a neighbor of u belongs to) 00790 TUNGraph::TNodeI UI = G->GetNI(u); 00791 TIntSet CIDSet(5 * UI.GetDeg()); 00792 TIntFltH CurFU = F[u]; 00793 for (int e = 0; e < UI.GetDeg(); e++) { 00794 if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; } 00795 TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)]; 00796 for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) { 00797 CIDSet.AddKey(CI.GetKey()); 00798 } 00799 } 00800 if (CIDSet.Empty()) { 00801 CurFU.Clr(); 00802 } 00803 else { 00804 for (TIntFltH::TIter CI = CurFU.BegI(); CI < CurFU.EndI(); CI++) { //remove the community membership which U does not share with its neighbors 00805 if (! CIDSet.IsKey(CI.GetKey())) { 00806 CurFU.DelIfKey(CI.GetKey()); 00807 } 00808 } 00809 GradientForRow(u, GradV, CIDSet); 00810 if (Norm2(GradV) < 1e-4) { NIDOPTV[u] = 1; continue; } 00811 double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta, 5); 00812 if (LearnRate <= 1e-5) { NewNIDV[ui] = -2; continue; } 00813 for (int ci = 0; ci < GradV.Len(); ci++) { 00814 int CID = GradV.GetKey(ci); 00815 double Change = LearnRate * GradV.GetDat(CID); 00816 double NewFuc = CurFU.IsKey(CID)? CurFU.GetDat(CID) + Change : Change; 00817 if (NewFuc <= 0.0) { 00818 CurFU.DelIfKey(CID); 00819 } else { 00820 CurFU.AddDat(CID) = NewFuc; 00821 } 00822 } 00823 CurFU.Defrag(); 00824 } 00825 //store changes 00826 NewF[ui] = CurFU; 00827 NewNIDV[ui] = u; 00828 } 00829 } 00830 int NumNoChangeGrad = 0; 00831 int NumNoChangeStepSize = 0; 00832 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 00833 int NewNID = NewNIDV[ui]; 00834 if (NewNID == -1) { NumNoChangeGrad++; continue; } 00835 if (NewNID == -2) { NumNoChangeStepSize++; continue; } 00836 for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) { 00837 SumFV[CI.GetKey()] -= CI.GetDat(); 00838 } 00839 } 00840 #pragma omp parallel for 00841 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 00842 int NewNID = NewNIDV[ui]; 00843 if (NewNID < 0) { continue; } 00844 F[NewNID] = NewF[ui]; 00845 } 00846 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 00847 int NewNID = NewNIDV[ui]; 00848 if (NewNID < 0) { continue; } 00849 for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) { 00850 SumFV[CI.GetKey()] += CI.GetDat(); 00851 } 00852 } 00853 // update the nodes who are optimal 00854 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 00855 int NewNID = NewNIDV[ui]; 00856 if (NewNID < 0) { continue; } 00857 TUNGraph::TNodeI UI = G->GetNI(NewNID); 00858 NIDOPTV[NewNID] = 0; 00859 for (int e = 0; e < UI.GetDeg(); e++) { 00860 NIDOPTV[UI.GetNbrNId(e)] = 0; 00861 } 00862 } 00863 int OPTCnt = 0; 00864 for (int i = 0; i < NIDOPTV.Len(); i++) { if (NIDOPTV[i] == 1) { OPTCnt++; } } 00865 if (! PlotNm.Empty()) { 00866 printf("\r%d iterations [%s] %d secs", iter * ChunkSize * ChunkNum, ExeTm.GetTmStr(), int(TSecTm::GetCurTm().GetAbsSecs() - StartTm)); 00867 if (PrevL > TFlt::Mn) { printf(" (%f) %d g %d s %d OPT", PrevL, NumNoChangeGrad, NumNoChangeStepSize, OPTCnt); } 00868 fflush(stdout); 00869 } 00870 if ((iter - PrevIter) * ChunkSize * ChunkNum >= G->GetNodes()) { 00871 PrevIter = iter; 00872 double CurL = Likelihood(true); 00873 IterLV.Add(TIntFltPr(iter * ChunkSize * ChunkNum, CurL)); 00874 printf("\r%d iterations, Likelihood: %f, Diff: %f [%d secs]", iter, CurL, CurL - PrevL, int(time(NULL) - InitTime)); 00875 fflush(stdout); 00876 if (CurL - PrevL <= Thres * fabs(PrevL)) { 00877 break; 00878 } 00879 else { 00880 PrevL = CurL; 00881 } 00882 } 00883 } 00884 if (! PlotNm.Empty()) { 00885 printf("\nMLE completed with %d iterations(%d secs)\n", iter, int(TSecTm::GetCurTm().GetAbsSecs() - StartTm)); 00886 TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q"); 00887 } else { 00888 printf("\rMLE completed with %d iterations(%d secs)", iter, int(time(NULL) - InitTime)); 00889 fflush(stdout); 00890 } 00891 return iter; 00892 }