SNAP Library 2.2, User Reference
2014-03-11 19:15:55
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
|
00001 00002 #include "stdafx.h" 00003 #include "agmfast.h" 00004 #include "agmdirected.h" 00005 #include "Snap.h" 00006 #include "agm.h" 00007 00008 void TCoda::Save(TSOut& SOut) { 00009 G->Save(SOut); 00010 F.Save(SOut); 00011 H.Save(SOut); 00012 NIDV.Save(SOut); 00013 RegCoef.Save(SOut); 00014 SumFV.Save(SOut); 00015 SumHV.Save(SOut); 00016 NodesOk.Save(SOut); 00017 MinVal.Save(SOut); 00018 MaxVal.Save(SOut); 00019 NegWgt.Save(SOut); 00020 NumComs.Save(SOut); 00021 HOVIDSV.Save(SOut); 00022 PNoCom.Save(SOut); 00023 } 00024 00025 void TCoda::Load(TSIn& SIn, const int& RndSeed) { 00026 G->Load(SIn); 00027 F.Load(SIn); 00028 H.Load(SIn); 00029 NIDV.Load(SIn); 00030 RegCoef.Load(SIn); 00031 SumFV.Load(SIn); 00032 SumHV.Load(SIn); 00033 NodesOk.Load(SIn); 00034 MinVal.Load(SIn); 00035 MaxVal.Load(SIn); 00036 NegWgt.Load(SIn); 00037 NumComs.Load(SIn); 00038 HOVIDSV.Load(SIn); 00039 PNoCom.Load(SIn); 00040 Rnd.PutSeed(RndSeed); 00041 } 00042 00043 void TCoda::RandomInit(const int InitComs) { 00044 F.Gen(G->GetNodes()); 00045 H.Gen(G->GetNodes()); 00046 SumFV.Gen(InitComs); 00047 SumHV.Gen(InitComs); 00048 NumComs = InitComs; 00049 for (int u = 0; u < F.Len(); u++) { 00050 //assign to just one community 00051 int Mem = G->GetNI(u).GetOutDeg(); 00052 if (Mem > 10) { Mem = 10; } 00053 for (int c = 0; c < Mem; c++) { 00054 int CID = Rnd.GetUniDevInt(InitComs); 00055 AddComOut(u, CID, Rnd.GetUniDev()); 00056 } 00057 } 00058 for (int u = 0; u < H.Len(); u++) { 00059 //assign to just one community 00060 int Mem = G->GetNI(u).GetInDeg(); 00061 if (Mem > 10) { Mem = 10; } 00062 for (int c = 0; c < Mem; c++) { 00063 int CID = Rnd.GetUniDevInt(InitComs); 00064 AddComIn(u, CID, Rnd.GetUniDev()); 00065 } 00066 } 00067 //assign a member to zero-member community (if any) 00068 for (int c = 0; c < SumFV.Len(); c++) { 00069 if (SumFV[c] == 0.0) { 00070 int UID = Rnd.GetUniDevInt(G->GetNodes()); 00071 AddComOut(UID, c, Rnd.GetUniDev()); 00072 } 00073 } 00074 //assign a member to zero-member community (if any) 00075 for (int c = 0; c < SumHV.Len(); c++) { 00076 if (SumHV[c] == 0.0) { 00077 int UID = Rnd.GetUniDevInt(G->GetNodes()); 00078 AddComIn(UID, c, Rnd.GetUniDev()); 00079 } 00080 } 00081 } 00082 00083 void TCoda::NeighborComInit(const int InitComs) { 00084 //initialize with best neighborhood communities (Gleich et.al. KDD'12) 00085 TExeTm RunTm; 00086 TFltIntPrV NIdPhiV(F.Len(), 0); 00087 TAGMFastUtil::GetNIdPhiV<PNGraph>(G, NIdPhiV); 00088 NeighborComInit(NIdPhiV, InitComs); 00089 } 00090 00091 void TCoda::NeighborComInit(TFltIntPrV& NIdPhiV, const int InitComs) { 00092 NIdPhiV.Sort(true); 00093 F.Gen(G->GetNodes()); 00094 H.Gen(G->GetNodes()); 00095 SumFV.Gen(InitComs); 00096 SumHV.Gen(InitComs); 00097 NumComs = InitComs; 00098 //TIntFltH NCPhiH(F.Len()); 00099 TIntSet InvalidNIDS(F.Len()); 00100 TIntV ChosenNIDV(InitComs, 0); //FOR DEBUG 00101 //choose nodes with local minimum in conductance 00102 int CurCID = 0; 00103 for (int ui = 0; ui < NIdPhiV.Len(); ui++) { 00104 int UID = NIdPhiV[ui].Val2; 00105 fflush(stdout); 00106 if (InvalidNIDS.IsKey(UID)) { continue; } 00107 ChosenNIDV.Add(UID); //FOR DEBUG 00108 //add the node and its neighbors to the current community 00109 TNGraph::TNodeI NI = G->GetNI(UID); 00110 if (NI.GetOutDeg() > 0) { AddComOut(UID, CurCID, 1.0); } 00111 if (NI.GetInDeg() > 0) { AddComIn(UID, CurCID, 1.0); } 00112 fflush(stdout); 00113 //add neighbors depending on whether they have incoming / outgoing edges from the center node (NI) 00114 for (int e = 0; e < NI.GetDeg(); e++) { 00115 int VID = NI.GetNbrNId(e); 00116 TNGraph::TNodeI VI = G->GetNI(VID); 00117 if (VI.GetOutDeg() > 0) { AddComOut(VID, CurCID, 1.0); } 00118 if (VI.GetInDeg() > 0) { AddComIn(VID, CurCID, 1.0); } 00119 } 00120 //exclude its neighbors from the next considerations 00121 for (int e = 0; e < NI.GetDeg(); e++) { 00122 InvalidNIDS.AddKey(NI.GetNbrNId(e)); 00123 } 00124 CurCID++; 00125 fflush(stdout); 00126 if (CurCID >= NumComs) { break; } 00127 } 00128 if (NumComs > CurCID) { 00129 printf("%d communities needed to fill randomly\n", NumComs - CurCID); 00130 } 00131 //assign a member to zero-member community (if any) 00132 for (int c = 0; c < SumFV.Len(); c++) { 00133 if (SumFV[c] == 0.0) { 00134 int ComSz = 10; 00135 for (int u = 0; u < ComSz; u++) { 00136 int UID = Rnd.GetUniDevInt(G->GetNodes()); 00137 AddComOut(UID, c, Rnd.GetUniDev()); 00138 } 00139 } 00140 } 00141 //assign a member to zero-member community (if any) 00142 for (int c = 0; c < SumHV.Len(); c++) { 00143 if (SumHV[c] == 0.0) { 00144 int ComSz = 10; 00145 for (int u = 0; u < ComSz; u++) { 00146 int UID = Rnd.GetUniDevInt(G->GetNodes()); 00147 AddComIn(UID, c, Rnd.GetUniDev()); 00148 } 00149 } 00150 } 00151 } 00152 00153 void TCoda::GetNonEdgePairScores(TFltIntIntTrV& ScoreV) { 00154 ScoreV.Gen(G->GetNodes() * G->GetNodes(), 0); 00155 TIntV NIDV; 00156 G->GetNIdV(NIDV); 00157 TIntSet Cuv; 00158 for (int u = 0; u < NIDV.Len(); u++) { 00159 int UID = NIDV[u]; 00160 for (int v = 0; v < NIDV.Len(); v++) { 00161 int VID = NIDV[v]; 00162 if (UID == VID) { continue; } 00163 if (! G->IsEdge(UID, VID)) { 00164 double Val = 1.0 - Prediction(UID, VID); 00165 ScoreV.Add(TFltIntIntTr(Val, UID, VID)); 00166 } 00167 } 00168 } 00169 } 00170 00171 void TCoda::SetCmtyVV(const TVec<TIntV>& CmtyVVOut, const TVec<TIntV>& CmtyVVIn) { 00172 IAssert(CmtyVVOut.Len() == CmtyVVIn.Len()); 00173 F.Gen(G->GetNodes()); 00174 H.Gen(G->GetNodes()); 00175 SumFV.Gen(CmtyVVOut.Len()); 00176 SumHV.Gen(CmtyVVIn.Len()); 00177 NumComs = CmtyVVOut.Len(); 00178 TIntH NIDIdxH(NIDV.Len()); 00179 if (! NodesOk) { 00180 for (int u = 0; u < NIDV.Len(); u++) { 00181 NIDIdxH.AddDat(NIDV[u], u); 00182 } 00183 } 00184 for (int c = 0; c < CmtyVVOut.Len(); c++) { 00185 for (int u = 0; u < CmtyVVOut[c].Len(); u++) { 00186 int UID = CmtyVVOut[c][u]; 00187 if (! NodesOk) { UID = NIDIdxH.GetDat(UID); } 00188 if (G->IsNode(UID)) { 00189 AddComOut(UID, c, 1.0); 00190 } 00191 } 00192 } 00193 for (int c = 0; c < CmtyVVIn.Len(); c++) { 00194 for (int u = 0; u < CmtyVVIn[c].Len(); u++) { 00195 int UID = CmtyVVIn[c][u]; 00196 if (! NodesOk) { UID = NIDIdxH.GetDat(UID); } 00197 if (G->IsNode(UID)) { 00198 AddComIn(UID, c, 1.0); 00199 } 00200 } 00201 } 00202 } 00203 00204 void TCoda::SetGraph(const PNGraph& GraphPt) { 00205 G = GraphPt; 00206 HOVIDSV.Gen(G->GetNodes()); 00207 NodesOk = true; 00208 GraphPt->GetNIdV(NIDV); 00209 // check that nodes IDs are {0,1,..,Nodes-1} 00210 for (int nid = 0; nid < GraphPt->GetNodes(); nid++) { 00211 if (! GraphPt->IsNode(nid)) { 00212 NodesOk = false; 00213 break; 00214 } 00215 } 00216 if (! NodesOk) { 00217 printf("rearrage nodes\n"); 00218 G = TSnap::GetSubGraph(GraphPt, NIDV, true); 00219 for (int nid = 0; nid < G->GetNodes(); nid++) { 00220 IAssert(G->IsNode(nid)); 00221 } 00222 } 00223 TSnap::DelSelfEdges(G); 00224 00225 PNoCom = 1.0 / (double) G->GetNodes(); 00226 DoParallel = false; 00227 if (1.0 / PNoCom > sqrt(TFlt::Mx)) { PNoCom = 0.99 / sqrt(TFlt::Mx); } // to prevent overflow 00228 NegWgt = 1.0; 00229 } 00230 00231 double TCoda::Likelihood(const bool _DoParallel) { 00232 TExeTm ExeTm; 00233 double L = 0.0; 00234 if (_DoParallel) { 00235 #pragma omp parallel for 00236 for (int u = 0; u < F.Len(); u++) { 00237 double LU = LikelihoodForNode(true, u); 00238 #pragma omp atomic 00239 L += LU; 00240 } 00241 } 00242 else { 00243 for (int u = 0; u < F.Len(); u++) { 00244 double LU = LikelihoodForNode(true, u); 00245 L += LU; 00246 } 00247 } 00248 return L; 00249 } 00250 00251 double TCoda::LikelihoodForNode(const bool IsRow, const int UID) { 00252 if (IsRow) { 00253 return LikelihoodForNode(IsRow, UID, F[UID]); 00254 } else { 00255 return LikelihoodForNode(IsRow, UID, H[UID]); 00256 } 00257 } 00258 00259 double TCoda::LikelihoodForNode(const bool IsRow, const int UID, const TIntFltH& FU) { 00260 double L = 0.0; 00261 TFltV HOSumHV; //adjust for Hv of v hold out 00262 if (HOVIDSV[UID].Len() > 0) { 00263 HOSumHV.Gen(NumComs); 00264 for (int e = 0; e < HOVIDSV[UID].Len(); e++) { 00265 for (int c = 0; c < SumHV.Len(); c++) { 00266 HOSumHV[c] += GetCom(! IsRow, HOVIDSV[UID][e], c); 00267 } 00268 } 00269 } 00270 TNGraph::TNodeI NI = G->GetNI(UID); 00271 const int Deg = IsRow ? NI.GetOutDeg(): NI.GetInDeg(); 00272 for (int e = 0; e < Deg; e++) { 00273 const int v = IsRow ? NI.GetOutNId(e): NI.GetInNId(e); 00274 if (v == UID) { continue; } 00275 if (HOVIDSV[UID].IsKey(v)) { continue; } 00276 if (IsRow) { 00277 L += log (1.0 - Prediction(FU, H[v])) + NegWgt * DotProduct(FU, H[v]); 00278 } else { 00279 L += log (1.0 - Prediction(F[v], FU)) + NegWgt * DotProduct(F[v], FU); 00280 } 00281 } 00282 for (TIntFltH::TIter HI = FU.BegI(); HI < FU.EndI(); HI++) { 00283 double HOSum = HOVIDSV[UID].Len() > 0? HOSumHV[HI.GetKey()].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist 00284 L -= NegWgt * (GetSumVal(! IsRow, HI.GetKey()) - HOSum - GetCom(! IsRow, UID, HI.GetKey())) * HI.GetDat(); 00285 } 00286 //add regularization 00287 if (RegCoef > 0.0) { //L1 00288 L -= RegCoef * Sum(FU); 00289 } 00290 if (RegCoef < 0.0) { //L2 00291 L += RegCoef * Norm2(FU); 00292 } 00293 return L; 00294 } 00295 00296 00297 /* 00298 double TCoda::LikelihoodForRow(const int UID) { 00299 return LikelihoodForRow(UID, F[UID]); 00300 } 00301 00302 00303 double TCoda::LikelihoodForRow(const int UID, const TIntFltH& FU) { 00304 double L = 0.0; 00305 TFltV HOSumHV; //adjust for Hv of v hold out 00306 if (HOVIDSV[UID].Len() > 0) { 00307 HOSumHV.Gen(SumFV.Len()); 00308 00309 for (int e = 0; e < HOVIDSV[UID].Len(); e++) { 00310 for (int c = 0; c < SumHV.Len(); c++) { 00311 HOSumHV[c] += GetComIn(HOVIDSV[UID][e], c); 00312 } 00313 } 00314 } 00315 TNGraph::TNodeI NI = G->GetNI(UID); 00316 for (int e = 0; e < NI.GetOutDeg(); e++) { 00317 int v = NI.GetOutNId(e); 00318 if (v == UID) { continue; } 00319 if (HOVIDSV[UID].IsKey(v)) { continue; } 00320 double LU = log (1.0 - Prediction(FU, H[v])) + NegWgt * DotProduct(FU, H[v]); 00321 L += LU; 00322 } 00323 for (TIntFltH::TIter HI = FU.BegI(); HI < FU.EndI(); HI++) { 00324 double HOSum = HOVIDSV[UID].Len() > 0? HOSumHV[HI.GetKey()].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist 00325 double LU = NegWgt * (SumHV[HI.GetKey()] - HOSum - GetComIn(UID, HI.GetKey())) * HI.GetDat(); 00326 L -= LU; 00327 } 00328 //add regularization 00329 if (RegCoef > 0.0) { //L1 00330 L -= RegCoef * Sum(FU); 00331 } 00332 if (RegCoef < 0.0) { //L2 00333 L += RegCoef * Norm2(FU); 00334 } 00335 00336 return L; 00337 } 00338 00339 double TCoda::LikelihoodForCol(const int VID) { 00340 return LikelihoodForCol(VID, H[VID]); 00341 } 00342 00343 00344 double TCoda::LikelihoodForCol(const int VID, const TIntFltH& HV) { 00345 double L = 0.0; 00346 TFltV HOSumFV; //adjust for Fv of v hold out 00347 if (HOVIDSV[VID].Len() > 0) { 00348 HOSumFV.Gen(SumFV.Len()); 00349 for (int e = 0; e < HOVIDSV[VID].Len(); e++) { 00350 for (int c = 0; c < SumFV.Len(); c++) { 00351 HOSumFV[c] += GetComOut(HOVIDSV[VID][e], c); 00352 } 00353 } 00354 } 00355 TNGraph::TNodeI NI = G->GetNI(VID); 00356 for (int e = 0; e < NI.GetInDeg(); e++) { 00357 int v = NI.GetInNId(e); 00358 if (v == VID) { continue; } 00359 if (HOVIDSV[VID].IsKey(v)) { continue; } 00360 L += log (1.0 - Prediction(F[v], HV)) + NegWgt * DotProduct(F[v], HV); 00361 } 00362 for (TIntFltH::TIter HI = HV.BegI(); HI < HV.EndI(); HI++) { 00363 double HOSum = HOVIDSV[VID].Len() > 0? HOSumFV[HI.GetKey()].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist 00364 L -= NegWgt * (SumFV[HI.GetKey()] - HOSum - GetComOut(VID, HI.GetKey())) * HI.GetDat(); 00365 } 00366 //add regularization 00367 if (RegCoef > 0.0) { //L1 00368 L -= RegCoef * Sum(HV); 00369 } 00370 if (RegCoef < 0.0) { //L2 00371 L += RegCoef * Norm2(HV); 00372 } 00373 00374 return L; 00375 } 00376 */ 00377 void TCoda::GradientForNode(const bool IsRow, const int UID, TIntFltH& GradU, const TIntSet& CIDSet) { 00378 GradU.Gen(CIDSet.Len()); 00379 00380 TFltV HOSumHV; //adjust for Hv of v hold out 00381 if (HOVIDSV[UID].Len() > 0) { 00382 HOSumHV.Gen(NumComs); 00383 for (int e = 0; e < HOVIDSV[UID].Len(); e++) { 00384 for (int c = 0; c < SumHV.Len(); c++) { 00385 HOSumHV[c] += GetCom(! IsRow, HOVIDSV[UID][e], c); 00386 } 00387 } 00388 } 00389 00390 TNGraph::TNodeI NI = G->GetNI(UID); 00391 int Deg = IsRow ? NI.GetOutDeg(): NI.GetInDeg(); 00392 TFltV PredV(Deg), GradV(CIDSet.Len()); 00393 TIntV CIDV(CIDSet.Len()); 00394 for (int e = 0; e < Deg; e++) { 00395 int VID = IsRow? NI.GetOutNId(e): NI.GetInNId(e); 00396 if (VID == UID) { continue; } 00397 if (HOVIDSV[UID].IsKey(VID)) { continue; } 00398 PredV[e] = IsRow? Prediction(UID, VID): Prediction(VID, UID); 00399 } 00400 00401 for (int c = 0; c < CIDSet.Len(); c++) { 00402 int CID = CIDSet.GetKey(c); 00403 double Val = 0.0; 00404 for (int e = 0; e < Deg; e++) { 00405 int VID = IsRow? NI.GetOutNId(e): NI.GetInNId(e); 00406 if (VID == UID) { continue; } 00407 if (HOVIDSV[UID].IsKey(VID)) { continue; } 00408 Val += PredV[e] * GetCom(! IsRow, VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(! IsRow, VID, CID); 00409 } 00410 double HOSum = HOVIDSV[UID].Len() > 0? HOSumHV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist 00411 Val -= NegWgt * (GetSumVal(! IsRow, CID) - HOSum - GetCom(! IsRow, UID, CID)); 00412 CIDV[c] = CID; 00413 GradV[c] = Val; 00414 } 00415 //add regularization 00416 if (RegCoef > 0.0) { //L1 00417 for (int c = 0; c < GradV.Len(); c++) { 00418 GradV[c] -= RegCoef; 00419 } 00420 } 00421 if (RegCoef < 0.0) { //L2 00422 for (int c = 0; c < GradV.Len(); c++) { 00423 GradV[c] += 2 * RegCoef * GetCom(IsRow, UID, CIDV[c]); 00424 } 00425 } 00426 for (int c = 0; c < GradV.Len(); c++) { 00427 if (GetCom(IsRow, UID, CIDV[c]) == 0.0 && GradV[c] < 0.0) { continue; } 00428 if (fabs(GradV[c]) < 0.0001) { continue; } 00429 GradU.AddDat(CIDV[c], GradV[c]); 00430 } 00431 for (int c = 0; c < GradU.Len(); c++) { 00432 if (GradU[c] >= 10) { GradU[c] = 10; } 00433 if (GradU[c] <= -10) { GradU[c] = -10; } 00434 IAssert(GradU[c] >= -10); 00435 } 00436 } 00437 00438 /* 00439 void TCoda::GradientForRow(const int UID, TIntFltH& GradU, const TIntSet& CIDSet) { 00440 GradU.Gen(CIDSet.Len()); 00441 00442 TFltV HOSumHV; //adjust for Hv of v hold out 00443 if (HOVIDSV[UID].Len() > 0) { 00444 HOSumHV.Gen(SumHV.Len()); 00445 00446 for (int e = 0; e < HOVIDSV[UID].Len(); e++) { 00447 for (int c = 0; c < SumHV.Len(); c++) { 00448 HOSumHV[c] += GetComIn(HOVIDSV[UID][e], c); 00449 } 00450 } 00451 } 00452 00453 TNGraph::TNodeI NI = G->GetNI(UID); 00454 int Deg = NI.GetOutDeg(); 00455 TFltV PredV(Deg), GradV(CIDSet.Len()); 00456 TIntV CIDV(CIDSet.Len()); 00457 for (int e = 0; e < Deg; e++) { 00458 if (NI.GetOutNId(e) == UID) { continue; } 00459 if (HOVIDSV[UID].IsKey(NI.GetOutNId(e))) { continue; } 00460 PredV[e] = Prediction(UID, NI.GetOutNId(e)); 00461 } 00462 00463 for (int c = 0; c < CIDSet.Len(); c++) { 00464 int CID = CIDSet.GetKey(c); 00465 double Val = 0.0; 00466 for (int e = 0; e < Deg; e++) { 00467 int VID = NI.GetNbrNId(e); 00468 if (VID == UID) { continue; } 00469 if (HOVIDSV[UID].IsKey(VID)) { continue; } 00470 Val += PredV[e] * GetComIn(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetComIn(VID, CID); 00471 } 00472 double HOSum = HOVIDSV[UID].Len() > 0? HOSumHV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist 00473 Val -= NegWgt * (SumHV[CID] - HOSum - GetComIn(UID, CID)); 00474 CIDV[c] = CID; 00475 GradV[c] = Val; 00476 } 00477 //add regularization 00478 if (RegCoef > 0.0) { //L1 00479 for (int c = 0; c < GradV.Len(); c++) { 00480 GradV[c] -= RegCoef; 00481 } 00482 } 00483 if (RegCoef < 0.0) { //L2 00484 for (int c = 0; c < GradV.Len(); c++) { 00485 GradV[c] += 2 * RegCoef * GetComOut(UID, CIDV[c]); 00486 } 00487 } 00488 for (int c = 0; c < GradV.Len(); c++) { 00489 if (GetComOut(UID, CIDV[c]) == 0.0 && GradV[c] < 0.0) { continue; } 00490 if (fabs(GradV[c]) < 0.0001) { continue; } 00491 GradU.AddDat(CIDV[c], GradV[c]); 00492 } 00493 for (int c = 0; c < GradU.Len(); c++) { 00494 if (GradU[c] >= 10) { GradU[c] = 10; } 00495 if (GradU[c] <= -10) { GradU[c] = -10; } 00496 IAssert(GradU[c] >= -10); 00497 } 00498 } 00499 */ 00500 00501 void TCoda::GetCmtyVV(const bool IsOut, TVec<TIntV>& CmtyVV) { 00502 GetCmtyVV(IsOut, CmtyVV, sqrt(1.0 / G->GetNodes()), 3); 00503 } 00504 00506 void TCoda::GetCommunity(TIntV& CmtyVIn, TIntV& CmtyVOut, const int CID, const double Thres) { 00507 TIntFltH NIDFucH(F.Len() / 10), NIDHucH(F.Len() / 10); 00508 for (int u = 0; u < NIDV.Len(); u++) { 00509 int NID = u; 00510 if (! NodesOk) { NID = NIDV[u]; } 00511 if (GetCom(true, u, CID) >= Thres) { NIDFucH.AddDat(NID, GetCom(true, u, CID)); } 00512 if (GetCom(false, u, CID) >= Thres) { NIDHucH.AddDat(NID, GetCom(false, u, CID)); } 00513 } 00514 NIDFucH.SortByDat(false); 00515 NIDHucH.SortByDat(false); 00516 NIDFucH.GetKeyV(CmtyVOut); 00517 NIDHucH.GetKeyV(CmtyVIn); 00518 } 00519 00520 void TCoda::GetTopCIDs(TIntV& CIdV, const int TopK, const int IsAverage, const int MinSz) { 00521 TIntFltH CIdFHH; 00522 for (int c = 0; c < GetNumComs(); c++) { 00523 if (IsAverage == 1) { 00524 TIntV CmtyVIn, CmtyVOut; 00525 GetCommunity(CmtyVIn, CmtyVOut, c); 00526 if (CmtyVIn.Len() == 0 || CmtyVOut.Len() == 0) { continue; } 00527 if (CmtyVIn.Len() < MinSz || CmtyVOut.Len() < MinSz) { continue; } 00528 CIdFHH.AddDat(c, GetSumVal(true, c) * GetSumVal(false, c) / (double) CmtyVIn.Len() / (double) CmtyVOut.Len()); 00529 } else { 00530 CIdFHH.AddDat(c, GetSumVal(true, c) * GetSumVal(false, c)); 00531 } 00532 } 00533 CIdFHH.SortByDat(false); 00534 CIdFHH.GetKeyV(CIdV); 00535 if (TopK < CIdFHH.Len()) { CIdV.Trunc(TopK); } 00536 } 00537 00539 void TCoda::GetCmtyVV(const bool IsOut, TVec<TIntV>& CmtyVV, const double Thres, const int MinSz) { 00540 CmtyVV.Gen(NumComs, 0); 00541 TIntFltH CIDSumFH(NumComs); 00542 for (int c = 0; c < NumComs; c++) { 00543 CIDSumFH.AddDat(c, GetSumVal(IsOut, c)); 00544 } 00545 CIDSumFH.SortByDat(false); 00546 for (int c = 0; c < NumComs; c++) { 00547 int CID = CIDSumFH.GetKey(c); 00548 TIntFltH NIDFucH, NIDHucH, NIDInOutH; 00549 TIntV CmtyV; 00550 GetNIDValH(NIDInOutH, NIDFucH, NIDHucH, CID, Thres); 00551 if (IsOut) { 00552 NIDFucH.GetKeyV(CmtyV); 00553 } else { 00554 NIDHucH.GetKeyV(CmtyV); 00555 } 00556 if (CmtyV.Len() >= MinSz) { CmtyVV.Add(CmtyV); } 00557 } 00558 if ( NumComs != CmtyVV.Len()) { 00559 printf("Community vector generated. %d communities are ommitted\n", NumComs.Val - CmtyVV.Len()); 00560 } 00561 } 00562 00563 void TCoda::GetCmtyVVUnSorted(const bool IsOut, TVec<TIntV>& CmtyVV, const double Thres, const int MinSz) { 00564 CmtyVV.Gen(NumComs, 0); 00565 for (int c = 0; c < NumComs; c++) { 00566 TIntV CmtyV((int) (GetSumVal(IsOut, c) * 10), 0); 00567 for (int u = 0; u < G->GetNodes(); u++) { 00568 if (GetCom(IsOut, u, c) > Thres) { CmtyV.Add(NIDV[u]); } 00569 } 00570 if (CmtyV.Len() >= MinSz) { CmtyVV.Add(CmtyV); } 00571 } 00572 if ( NumComs != CmtyVV.Len()) { 00573 printf("Community vector generated. %d communities are ommitted\n", NumComs.Val - CmtyVV.Len()); 00574 } 00575 } 00576 00577 PNGraph TCoda::GetGraphRawNID() { 00578 PNGraph NewG = TNGraph::New(G->GetNodes(), -1); 00579 for (TNGraph::TNodeI NI = G->BegNI(); NI < G->EndNI(); NI++) { 00580 //add node 00581 int NIdx = NI.GetId(); 00582 int NID = NIDV[NIdx]; 00583 if (! NewG->IsNode(NID)) { NewG->AddNode(NID); } 00584 //add edge 00585 for (int e = 0; e < NI.GetOutDeg(); e++) { 00586 int OutNID = NIDV[NI.GetOutNId(e)]; 00587 if (! NewG->IsNode(OutNID)) { NewG->AddNode(OutNID); } 00588 NewG->AddEdge(NID, OutNID); 00589 } 00590 } 00591 IAssert(G->GetNodes() == NewG->GetNodes()); 00592 IAssert(G->GetEdges() == NewG->GetEdges()); 00593 return NewG; 00594 } 00595 00596 void TCoda::GetNIDValH(TIntFltH& NIdValInOutH, TIntFltH& NIdValOutH, TIntFltH& NIdValInH, const int CID, const double Thres) { 00597 NIdValOutH.Gen((int) GetSumVal(true, CID) + 1); 00598 NIdValInH.Gen((int) GetSumVal(false, CID) + 1); 00599 NIdValInOutH.Gen((int) GetSumVal(false, CID) + 1); 00600 if (GetSumVal(true, CID) < Thres && GetSumVal(false, CID) < Thres) { return; } 00601 for (int u = 0; u < NIDV.Len(); u++) { 00602 if (GetCom(true, u, CID) >= Thres && GetCom(false, u, CID) >= Thres) { 00603 NIdValInOutH.AddDat(NIDV[u], GetCom(true, u, CID) + GetCom(false, u, CID)); 00604 } 00605 if (GetCom(true, u, CID) >= Thres) { 00606 NIdValOutH.AddDat(NIDV[u], GetCom(true, u, CID)); 00607 } 00608 if (GetCom(false, u, CID) >= Thres) { 00609 NIdValInH.AddDat(NIDV[u], GetCom(false, u, CID)); 00610 } 00611 } 00612 NIdValInH.SortByDat(false); 00613 NIdValOutH.SortByDat(false); 00614 NIdValInOutH.SortByDat(false); 00615 } 00616 00617 void TCoda::DumpMemberships(const TStr& OutFNm, const TStrHash<TInt>& NodeNameH, const double Thres) { 00618 if (NodeNameH.Len() > 0) { IAssert(NodeNameH.Len() == G->GetNodes()); } 00619 FILE* FId = fopen(OutFNm.CStr(), "wt"); 00620 TIntFltH CIDSumFH(NumComs); 00621 for (int c = 0; c < NumComs; c++) { 00622 CIDSumFH.AddDat(c, GetSumVal(true, c) * GetSumVal(false, c)); 00623 } 00624 CIDSumFH.SortByDat(false); 00625 for (int c = 0; c < NumComs; c++) { 00626 int CID = CIDSumFH.GetKey(c); 00627 TIntFltH NIDOutFH, NIDInFH, NIDInOutFH; 00628 GetNIDValH(NIDInOutFH, NIDOutFH, NIDInFH, CID, Thres); 00629 if (NIDOutFH.Len() == 0 || NIDInFH.Len() == 0) { continue; } 00630 00631 /* 00632 if (GetSumVal(true, CID) < Thres && GetSumVal(false, CID) < Thres) { continue; } 00633 for (int u = 0; u < NIDV.Len(); u++) { 00634 if (GetCom(true, u, CID) >= Thres && GetCom(false, u, CID) >= Thres) { 00635 NIDInOutFH.AddDat(u, GetCom(true, u, CID) + GetCom(false, u, CID)); 00636 } else if (GetCom(true, u, CID) >= Thres) { 00637 NIDOutFH.AddDat(u, GetCom(true, u, CID)); 00638 } else if (GetCom(false, u, CID) >= Thres) { 00639 NIDInFH.AddDat(u, GetCom(false, u, CID)); 00640 } 00641 } 00642 NIDInOutFH.SortByDat(false); 00643 NIDInFH.SortByDat(false); 00644 NIDOutFH.SortByDat(false); 00645 */ 00646 fprintf(FId, "%d\t%d\t%d\t%f\t%f\t%f\t", NIDInOutFH.Len(), NIDInFH.Len() - NIDInOutFH.Len(), NIDOutFH.Len() - NIDInOutFH.Len(), CIDSumFH.GetDat(CID).Val, GetSumVal(false, CID).Val, GetSumVal(true, CID).Val); 00647 fprintf(FId, "InOut:\t"); 00648 for (int u = 0; u < NIDInOutFH.Len(); u++) { 00649 int NIdx = NIDInOutFH.GetKey(u); 00650 fprintf(FId, "%s (%f)\t", NodeNameH.GetKey(NIdx), NIDInOutFH[u].Val); 00651 } 00652 fprintf(FId, "In:\t"); 00653 for (int u = 0; u < NIDInFH.Len(); u++) { 00654 int NIdx = NIDInFH.GetKey(u); 00655 fprintf(FId, "%s (%f)\t", NodeNameH.GetKey(NIdx), NIDInFH[u].Val); 00656 } 00657 fprintf(FId, "Out:\t"); 00658 for (int u = 0; u < NIDOutFH.Len(); u++) { 00659 int NIdx = NIDOutFH.GetKey(u); 00660 fprintf(FId, "%s (%f)\t", NodeNameH.GetKey(NIdx), NIDOutFH[u].Val); 00661 } 00662 fprintf(FId, "\n"); 00663 } 00664 fclose(FId); 00665 } 00666 00667 00668 void TCoda::DumpMemberships(const TStr& OutFNm, const double Thres) { 00669 TStrHash<TInt> NodeNameH(G->GetNodes(), false); 00670 for (int u = 0; u < NIDV.Len(); u++) { NodeNameH.AddKey(TStr::Fmt("%d", NIDV[u].Val)); } 00671 DumpMemberships(OutFNm, NodeNameH, Thres); 00672 } 00673 00674 void TCoda::GetCmtyS(TIntSet& CmtySOut, TIntSet& CmtySIn, const int CID, const double Thres) { 00675 CmtySOut.Gen(G->GetNodes() / 10); 00676 CmtySIn.Gen(G->GetNodes() / 10); 00677 for (int u = 0; u < NIDV.Len(); u++) { 00678 if (GetCom(true, u, CID) > Thres) { 00679 //CmtySOut.AddKey( 00680 } 00681 } 00682 } 00683 00684 /* 00685 void TCoda::GetCmtyVVIn(TVec<TIntV>& CmtyVV) { 00686 GetCmtyVVIn(CmtyVV, sqrt(1.0 / G->GetNodes()), 3); 00687 } 00688 00690 void TCoda::GetCmtyVVIn(TVec<TIntV>& CmtyVV, const double Thres, const int MinSz) { 00691 CmtyVV.Gen(NumComs, 0); 00692 TIntFltH CIDSumHH(NumComs); 00693 for (int c = 0; c < SumHV.Len(); c++) { 00694 CIDSumHH.AddDat(c, SumHV[c]); 00695 } 00696 CIDSumHH.SortByDat(false); 00697 for (int c = 0; c < NumComs; c++) { 00698 int CID = CIDSumHH.GetKey(c); 00699 TIntFltH NIDHucH(H.Len() / 10); 00700 TIntV CmtyV; 00701 IAssert(SumHV[CID] == CIDSumHH.GetDat(CID)); 00702 if (SumHV[CID] < Thres) { continue; } 00703 for (int u = 0; u < H.Len(); u++) { 00704 int NID = u; 00705 if (! NodesOk) { NID = NIDV[u]; } 00706 if (GetComIn(u, CID) >= Thres) { NIDHucH.AddDat(NID, GetComIn(u, CID)); } 00707 } 00708 NIDHucH.SortByDat(false); 00709 NIDHucH.GetKeyV(CmtyV); 00710 if (CmtyV.Len() >= MinSz) { CmtyVV.Add(CmtyV); } 00711 } 00712 if ( NumComs != CmtyVV.Len()) { 00713 printf("Community vector generated. %d communities are ommitted\n", NumComs.Val - CmtyVV.Len()); 00714 } 00715 } 00716 */ 00717 void TCoda::GetCmtyVV(TVec<TIntV>& CmtyVVOut, TVec<TIntV>& CmtyVVIn, const int MinSz) { 00718 GetCmtyVV(false, CmtyVVIn, sqrt(1.0 / G->GetNodes()), MinSz); 00719 GetCmtyVV(true, CmtyVVOut, sqrt(1.0 / G->GetNodes()), MinSz); 00720 } 00721 00722 void TCoda::GetCmtyVVUnSorted(TVec<TIntV>& CmtyVVOut, TVec<TIntV>& CmtyVVIn) { 00723 GetCmtyVVUnSorted(false, CmtyVVIn, sqrt(1.0 / G->GetNodes())); 00724 GetCmtyVVUnSorted(true, CmtyVVOut, sqrt(1.0 / G->GetNodes())); 00725 } 00726 00727 void TCoda::GetCmtyVV(TVec<TIntV>& CmtyVVOut, TVec<TIntV>& CmtyVVIn, const double ThresOut, const double ThresIn, const int MinSz) { 00728 GetCmtyVV(false, CmtyVVIn, ThresIn, MinSz); 00729 GetCmtyVV(true, CmtyVVOut, ThresOut, MinSz); 00730 } 00731 00733 int TCoda::FindComsByCV(const int NumThreads, const int MaxComs, const int MinComs, const int DivComs, const TStr OutFNm, const int EdgesForCV, const double StepAlpha, const double StepBeta) { 00734 double ComsGap = exp(TMath::Log((double) MaxComs / (double) MinComs) / (double) DivComs); 00735 TIntV ComsV; 00736 ComsV.Add(MinComs); 00737 while (ComsV.Len() < DivComs) { 00738 int NewComs = int(ComsV.Last() * ComsGap); 00739 if (NewComs == ComsV.Last().Val) { NewComs++; } 00740 ComsV.Add(NewComs); 00741 } 00742 if (ComsV.Last() < MaxComs) { ComsV.Add(MaxComs); } 00743 return FindComsByCV(ComsV, 0.1, NumThreads, OutFNm, EdgesForCV, StepAlpha, StepBeta); 00744 } 00745 00746 int TCoda::FindComsByCV(TIntV& ComsV, const double HOFrac, const int NumThreads, const TStr PlotLFNm, const int EdgesForCV, const double StepAlpha, const double StepBeta) { 00747 if (ComsV.Len() == 0) { 00748 int MaxComs = G->GetNodes() / 5; 00749 ComsV.Add(2); 00750 while(ComsV.Last() < MaxComs) { ComsV.Add(ComsV.Last() * 2); } 00751 } 00752 int MaxIterCV = 3; 00753 00754 TVec<TVec<TIntSet> > HoldOutSets(MaxIterCV); 00755 TFltIntPrV NIdPhiV; 00756 TAGMFastUtil::GetNIdPhiV<PNGraph>(G, NIdPhiV); 00757 00758 if (G->GetEdges() > EdgesForCV) { //if edges are many enough, use CV 00759 printf("generating hold out set\n"); 00760 TIntV NIdV1, NIdV2; 00761 G->GetNIdV(NIdV1); 00762 G->GetNIdV(NIdV2); 00763 for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) { 00764 // generate holdout sets 00765 TAGMFastUtil::GenHoldOutPairs(G, HoldOutSets[IterCV], HOFrac, Rnd); 00766 /* 00767 HoldOutSets[IterCV].Gen(G->GetNodes()); 00768 const int HOTotal = int(HOFrac * G->GetNodes() * (G->GetNodes() - 1) / 2.0); 00769 int HOCnt = 0; 00770 int HOEdges = (int) TMath::Round(HOFrac * G->GetEdges()); 00771 printf("holding out %d edges...\n", HOEdges); 00772 for (int he = 0; he < (int) HOEdges; he++) { 00773 HoldOutSets[IterCV][EdgeV[he].Val1].AddKey(EdgeV[he].Val2); 00774 HoldOutSets[IterCV][EdgeV[he].Val2].AddKey(EdgeV[he].Val1); 00775 HOCnt++; 00776 } 00777 printf("%d Edges hold out\n", HOCnt); 00778 while(HOCnt++ < HOTotal) { 00779 int SrcNID = Rnd.GetUniDevInt(G->GetNodes()); 00780 int DstNID = Rnd.GetUniDevInt(G->GetNodes()); 00781 HoldOutSets[IterCV][SrcNID].AddKey(DstNID); 00782 HoldOutSets[IterCV][DstNID].AddKey(SrcNID); 00783 } 00784 */ 00785 } 00786 00787 printf("hold out set generated\n"); 00788 } 00789 00790 TFltV HOLV(ComsV.Len()); 00791 TIntFltPrV ComsLV; 00792 for (int c = 0; c < ComsV.Len(); c++) { 00793 const int Coms = ComsV[c]; 00794 printf("Try number of Coms:%d\n", Coms); 00795 00796 if (G->GetEdges() > EdgesForCV) { //if edges are many enough, use CV 00797 for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) { 00798 HOVIDSV = HoldOutSets[IterCV]; 00799 NeighborComInit(NIdPhiV, Coms); 00800 printf("Initialized\n"); 00801 00802 if (NumThreads == 1) { 00803 printf("MLE without parallelization begins\n"); 00804 MLEGradAscent(0.05, 10 * G->GetNodes(), "", StepAlpha, StepBeta); 00805 } else { 00806 printf("MLE with parallelization begins\n"); 00807 MLEGradAscentParallel(0.05, 100, NumThreads, "", StepAlpha, StepBeta); 00808 } 00809 double HOL = LikelihoodHoldOut(); 00810 HOL = HOL < 0? HOL: TFlt::Mn; 00811 HOLV[c] += HOL; 00812 } 00813 } 00814 else { 00815 HOVIDSV.Gen(G->GetNodes()); 00816 MLEGradAscent(0.0001, 100 * G->GetNodes(), ""); 00817 double BIC = 2 * Likelihood() - (double) G->GetNodes() * Coms * 2.0 * log ( (double) G->GetNodes()); 00818 HOLV[c] = BIC; 00819 } 00820 } 00821 int EstComs = 2; 00822 double MaxL = TFlt::Mn; 00823 printf("\n"); 00824 for (int c = 0; c < ComsV.Len(); c++) { 00825 ComsLV.Add(TIntFltPr(ComsV[c].Val, HOLV[c].Val)); 00826 printf("%d(%f)\t", ComsV[c].Val, HOLV[c].Val); 00827 if (MaxL < HOLV[c]) { 00828 MaxL = HOLV[c]; 00829 EstComs = ComsV[c]; 00830 } 00831 } 00832 printf("\n"); 00833 RandomInit(EstComs); 00834 HOVIDSV.Gen(G->GetNodes()); 00835 if (! PlotLFNm.Empty()) { 00836 TGnuPlot::PlotValV(ComsLV, PlotLFNm, "hold-out likelihood", "communities", "likelihood"); 00837 } 00838 return EstComs; 00839 } 00840 00841 double TCoda::LikelihoodHoldOut(const bool DoParallel) { 00842 double L = 0.0; 00843 for (int u = 0; u < HOVIDSV.Len(); u++) { 00844 for (int e = 0; e < HOVIDSV[u].Len(); e++) { 00845 int VID = HOVIDSV[u][e]; 00846 if (VID == u) { continue; } 00847 double Pred = Prediction(u, VID); 00848 if (G->IsEdge(u, VID)) { 00849 L += log(1.0 - Pred); 00850 } 00851 else { 00852 L += NegWgt * log(Pred); 00853 } 00854 } 00855 } 00856 return L; 00857 } 00858 00859 double TCoda::GetStepSizeByLineSearch(const bool IsRow, const int UID, const TIntFltH& DeltaV, const TIntFltH& GradV, const double& Alpha, const double& Beta, const int MaxIter) { 00860 double StepSize = 1.0; 00861 double InitLikelihood = LikelihoodForNode(IsRow, UID); 00862 TIntFltH NewVarV(DeltaV.Len()); 00863 for(int iter = 0; iter < MaxIter; iter++) { 00864 for (int i = 0; i < DeltaV.Len(); i++){ 00865 int CID = DeltaV.GetKey(i); 00866 double NewVal; 00867 NewVal = GetCom(IsRow, UID, CID) + StepSize * DeltaV.GetDat(CID); 00868 if (NewVal < MinVal) { NewVal = MinVal; } 00869 if (NewVal > MaxVal) { NewVal = MaxVal; } 00870 NewVarV.AddDat(CID, NewVal); 00871 } 00872 if (LikelihoodForNode(IsRow, UID, NewVarV) < InitLikelihood + Alpha * StepSize * DotProduct(GradV, DeltaV)) { 00873 StepSize *= Beta; 00874 } else { 00875 break; 00876 } 00877 if (iter == MaxIter - 1) { 00878 StepSize = 0.0; 00879 break; 00880 } 00881 } 00882 return StepSize; 00883 } 00884 00885 int TCoda::MLEGradAscent(const double& Thres, const int& MaxIter, const TStr PlotNm, const double StepAlpha, const double StepBeta) { 00886 time_t InitTime = time(NULL); 00887 TExeTm ExeTm, CheckTm; 00888 int iter = 0, PrevIter = 0; 00889 TIntFltPrV IterLV; 00890 TNGraph::TNodeI UI; 00891 double PrevL = TFlt::Mn, CurL = 0.0; 00892 TIntV NIdxV(F.Len(), 0); 00893 for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); } 00894 IAssert(NIdxV.Len() == F.Len()); 00895 TIntFltH GradV; 00896 while(iter < MaxIter) { 00897 NIdxV.Shuffle(Rnd); 00898 for (int ui = 0; ui < F.Len(); ui++, iter++) { 00899 const bool IsRow = (ui % 2 == 0); 00900 int u = NIdxV[ui]; // 00901 //find set of candidate c (we only need to consider c to which a neighbor of u belongs to) 00902 UI = G->GetNI(u); 00903 const int Deg = IsRow? UI.GetOutDeg(): UI.GetInDeg(); 00904 TIntSet CIDSet(5 * Deg); 00905 for (int e = 0; e < Deg; e++) { 00906 int VID = IsRow? UI.GetOutNId(e): UI.GetInNId(e); 00907 if (HOVIDSV[u].IsKey(VID)) { continue; } 00908 TIntFltH NbhCIDH = IsRow? H[VID]: F[VID]; 00909 for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) { 00910 CIDSet.AddKey(CI.GetKey()); 00911 //printf("CI.GetKey:%d\n", CI.GetKey()); 00912 IAssert(CI.GetKey() <= NumComs); 00913 } 00914 } 00915 TIntFltH& CurMem = IsRow? F[u]: H[u]; 00916 for (TIntFltH::TIter CI = CurMem.BegI(); CI < CurMem.EndI(); CI++) { //remove the community membership which U does not share with its neighbors 00917 if (! CIDSet.IsKey(CI.GetKey())) { 00918 DelCom(IsRow, u, CI.GetKey()); 00919 } 00920 } 00921 if (CIDSet.Empty()) { continue; } 00922 GradientForNode(IsRow, u, GradV, CIDSet); 00923 if (Norm2(GradV) < 1e-4) { continue; } 00924 double LearnRate = GetStepSizeByLineSearch(IsRow, u, GradV, GradV, StepAlpha, StepBeta); 00925 if (LearnRate == 0.0) { continue; } 00926 for (int ci = 0; ci < GradV.Len(); ci++) { 00927 int CID = GradV.GetKey(ci); 00928 double Change = LearnRate * GradV.GetDat(CID); 00929 double NewFuc = GetCom(IsRow, u, CID) + Change; 00930 if (NewFuc <= 0.0) { 00931 DelCom(IsRow, u, CID); 00932 } else { 00933 AddCom(IsRow, u, CID, NewFuc); 00934 } 00935 } 00936 if (! PlotNm.Empty() && (iter + 1) % G->GetNodes() == 0) { 00937 IterLV.Add(TIntFltPr(iter, Likelihood(false))); 00938 } 00939 } 00940 printf("\r%d iterations (%f) [%lu sec]", iter, CurL, time(NULL) - InitTime); 00941 fflush(stdout); 00942 if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) { 00943 PrevIter = iter; 00944 CurL = Likelihood(); 00945 if (PrevL > TFlt::Mn && ! PlotNm.Empty()) { 00946 printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL, CurL - PrevL); 00947 } 00948 fflush(stdout); 00949 if (CurL - PrevL <= Thres * fabs(PrevL)) { break; } 00950 else { PrevL = CurL; } 00951 } 00952 00953 } 00954 printf("\n"); 00955 printf("MLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr()); 00956 if (! PlotNm.Empty()) { 00957 TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q"); 00958 } 00959 return iter; 00960 } 00961 00962 int TCoda::MLEGradAscentParallel(const double& Thres, const int& MaxIter, const int ChunkNum, const int ChunkSize, const TStr PlotNm, const double StepAlpha, const double StepBeta) { 00963 //parallel 00964 time_t InitTime = time(NULL); 00965 //uint64 StartTm = TSecTm::GetCurTm().GetAbsSecs(); 00966 TExeTm ExeTm, CheckTm; 00967 double PrevL = Likelihood(true); 00968 TIntFltPrV IterLV; 00969 int PrevIter = 0; 00970 int iter = 0; 00971 TIntV NIdxV(F.Len(), 0); 00972 for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); } 00973 TIntV NIDOPTV(F.Len()); //check if a node needs optimization or not 1: does not require optimization 00974 NIDOPTV.PutAll(0); 00975 TVec<TIntFltH> NewF(ChunkNum * ChunkSize); 00976 TIntV NewNIDV(ChunkNum * ChunkSize); 00977 TBoolV IsRowV(ChunkNum * ChunkSize); 00978 for (iter = 0; iter < MaxIter; iter++) { 00979 NIdxV.Clr(false); 00980 for (int i = 0; i < F.Len(); i++) { 00981 //if (NIDOPTV[i] == 0) { NIdxV.Add(i); } 00982 NIdxV.Add(i); 00983 } 00984 IAssert (NIdxV.Len() <= F.Len()); 00985 NIdxV.Shuffle(Rnd); 00986 // compute gradient for chunk of nodes 00987 #pragma omp parallel for schedule(static, 1) 00988 for (int TIdx = 0; TIdx < ChunkNum; TIdx++) { 00989 TIntFltH GradV; 00990 for (int ui = TIdx * ChunkSize; ui < (TIdx + 1) * ChunkSize; ui++) { 00991 const bool IsRow = (ui % 2 == 0); 00992 NewNIDV[ui] = -1; 00993 if (ui > NIdxV.Len()) { continue; } 00994 const int u = NIdxV[ui]; // 00995 //find set of candidate c (we only need to consider c to which a neighbor of u belongs to) 00996 TNGraph::TNodeI UI = G->GetNI(u); 00997 const int Deg = IsRow? UI.GetOutDeg(): UI.GetInDeg(); 00998 TIntSet CIDSet(5 * Deg); 00999 TIntFltH CurFU = IsRow? F[u]: H[u]; 01000 for (int e = 0; e < Deg; e++) { 01001 int VID = IsRow? UI.GetOutNId(e): UI.GetInNId(e); 01002 if (HOVIDSV[u].IsKey(VID)) { continue; } 01003 TIntFltH& NbhCIDH = IsRow? H[VID]: F[VID]; 01004 for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) { 01005 CIDSet.AddKey(CI.GetKey()); 01006 } 01007 } 01008 if (CIDSet.Empty()) { 01009 CurFU.Clr(); 01010 } 01011 else { 01012 for (TIntFltH::TIter CI = CurFU.BegI(); CI < CurFU.EndI(); CI++) { //remove the community membership which U does not share with its neighbors 01013 if (! CIDSet.IsKey(CI.GetKey())) { 01014 CurFU.DelIfKey(CI.GetKey()); 01015 } 01016 } 01017 GradientForNode(IsRow, u, GradV, CIDSet); 01018 if (Norm2(GradV) < 1e-4) { NIDOPTV[u] = 1; continue; } 01019 double LearnRate = GetStepSizeByLineSearch(IsRow, u, GradV, GradV, StepAlpha, StepBeta); 01020 if (LearnRate == 0.0) { NewNIDV[ui] = -2; continue; } 01021 for (int ci = 0; ci < GradV.Len(); ci++) { 01022 int CID = GradV.GetKey(ci); 01023 double Change = LearnRate * GradV.GetDat(CID); 01024 double NewFuc = CurFU.IsKey(CID)? CurFU.GetDat(CID) + Change : Change; 01025 if (NewFuc <= 0.0) { 01026 CurFU.DelIfKey(CID); 01027 } else { 01028 CurFU.AddDat(CID) = NewFuc; 01029 } 01030 } 01031 CurFU.Defrag(); 01032 } 01033 //store changes 01034 NewF[ui] = CurFU; 01035 NewNIDV[ui] = u; 01036 IsRowV[ui] = IsRow; 01037 } 01038 } 01039 int NumNoChangeGrad = 0; 01040 int NumNoChangeStepSize = 0; 01041 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 01042 int NewNID = NewNIDV[ui]; 01043 if (NewNID == -1) { NumNoChangeGrad++; continue; } 01044 if (NewNID == -2) { NumNoChangeStepSize++; continue; } 01045 if (IsRowV[ui]) { 01046 for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) { 01047 SumFV[CI.GetKey()] -= CI.GetDat(); 01048 } 01049 } else { 01050 for (TIntFltH::TIter CI = H[NewNID].BegI(); CI < H[NewNID].EndI(); CI++) { 01051 SumHV[CI.GetKey()] -= CI.GetDat(); 01052 } 01053 } 01054 } 01055 #pragma omp parallel for 01056 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 01057 int NewNID = NewNIDV[ui]; 01058 if (NewNID < 0) { continue; } 01059 if (IsRowV[ui]) { 01060 F[NewNID] = NewF[ui]; 01061 } else { 01062 H[NewNID] = NewF[ui]; 01063 } 01064 } 01065 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 01066 int NewNID = NewNIDV[ui]; 01067 if (NewNID < 0) { continue; } 01068 if (IsRowV[ui]) { 01069 for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) { 01070 SumFV[CI.GetKey()] += CI.GetDat(); 01071 } 01072 } else { 01073 for (TIntFltH::TIter CI = H[NewNID].BegI(); CI < H[NewNID].EndI(); CI++) { 01074 SumHV[CI.GetKey()] += CI.GetDat(); 01075 } 01076 } 01077 } 01078 // update the nodes who are optimal 01079 for (int ui = 0; ui < NewNIDV.Len(); ui++) { 01080 int NewNID = NewNIDV[ui]; 01081 if (NewNID < 0) { continue; } 01082 TNGraph::TNodeI UI = G->GetNI(NewNID); 01083 NIDOPTV[NewNID] = 0; 01084 for (int e = 0; e < UI.GetDeg(); e++) { 01085 NIDOPTV[UI.GetNbrNId(e)] = 0; 01086 } 01087 } 01088 int OPTCnt = 0; 01089 for (int i = 0; i < NIDOPTV.Len(); i++) { if (NIDOPTV[i] == 1) { OPTCnt++; } } 01090 /* 01091 if (! PlotNm.Empty()) { 01092 printf("\r%d iterations [%s] %lu secs", iter * ChunkSize * ChunkNum, ExeTm.GetTmStr(), TSecTm::GetCurTm().GetAbsSecs() - StartTm); 01093 if (PrevL > TFlt::Mn) { printf(" (%f) %d g %d s %d OPT", PrevL, NumNoChangeGrad, NumNoChangeStepSize, OPTCnt); } 01094 fflush(stdout); 01095 } 01096 */ 01097 if ((iter - PrevIter) * ChunkSize * ChunkNum >= G->GetNodes()) { 01098 PrevIter = iter; 01099 double CurL = Likelihood(true); 01100 IterLV.Add(TIntFltPr(iter * ChunkSize * ChunkNum, CurL)); 01101 printf("\r%d iterations, Likelihood: %f, Diff: %f [%lu secs]", iter, CurL, CurL - PrevL, time(NULL) - InitTime); 01102 fflush(stdout); 01103 if (CurL - PrevL <= Thres * fabs(PrevL)) { 01104 break; 01105 } 01106 else { 01107 PrevL = CurL; 01108 } 01109 } 01110 } 01111 if (! PlotNm.Empty()) { 01112 printf("\nMLE completed with %d iterations(%lu secs)\n", iter, time(NULL) - InitTime); 01113 TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q"); 01114 } else { 01115 printf("\rMLE completed with %d iterations(%lu secs)\n", iter, time(NULL) - InitTime); 01116 fflush(stdout); 01117 } 01118 return iter; 01119 }