SNAP Library 6.0, Developer Reference  2020-12-09 16:24:20
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
randwalk.h
Go to the documentation of this file.
1 //
2 // randwalk.h
3 // snap-core
4 //
5 // Created by Peter Lofgren on 8/5/15.
6 // Copyright (c) 2015 infolab. All rights reserved.
7 //
8 #ifndef snap_core_randwalk_h
9 #define snap_core_randwalk_h
10 
11 #include <stdio.h>
12 #include "priorityqueue.h"
13 #include "Snap.h"
14 
15 // use anonymous namespace to limit name to this file.
16 namespace {
17  double WallClockTime() {
18  // from http://stackoverflow.com/questions/588307/c-obtaining-milliseconds-time-on-linux-clock-doesnt-seem-to-work-properl/
19  struct timeval t;
20  gettimeofday(&t, NULL);
21  return t.tv_sec + t.tv_usec / 1.0e6;
22  }
23 
24  // Given a target nodeId, computes results in two maps Estimates and Residuals and one value MaxResidual.
25  // Each value Estimates.GetDat(v)
26  // approximates ppr(v, target) with additive error MaxResidual, and the residuals satisfy a
27  // useful loop invariant. See the paper "Local Computation of PageRank Contributions" for details.
28  // This variant is for the balanced variant of bidirectional ppr. It does reverse pushes, decreasing maxResidual incrementally,
29  // until the time spent equals the remaining time required for forward walks, as estimated using ForwardSecondsRMaxRatio
30  // (which is the ratio between the expected forward random walk time and the current MaxResidual that hasn't been pushed back).
31  template <class PGraph>
32  void ApproxContributionsBalanced(const PGraph& Graph,
33  double JumpProb,
34  int TargetNId,
35  float ForwardSecondsRMaxRatio,
36  TIntFltH& ResultEstimates,
37  TIntFltH& ResultResiduals,
38  float& ResultMaxResidual) {
39  double startTime = WallClockTime();
40  TMaxPriorityQueue<TInt> nodesByResidual;
41  nodesByResidual.Insert(TargetNId, 1.0f);
42  while (!nodesByResidual.IsEmpty() &&
43  WallClockTime() - startTime < ForwardSecondsRMaxRatio * nodesByResidual.GetMaxPriority()) {
44  float vPriority = nodesByResidual.GetMaxPriority();
45  int vId = nodesByResidual.PopMax();
46  ResultEstimates(vId) = ResultEstimates(vId) + JumpProb * vPriority;
47  //printf("set estimate(%d) to %g\n", vId, double(ResultEstimates(vId)));
48  TNGraph::TNodeI v = Graph->GetNI(vId);
49  for (int i = 0; i < v.GetInDeg(); i++) {
50  int uId = v.GetInNId(i);
51  TNGraph::TNodeI u = Graph->GetNI(uId);
52  float residualChange = (1.0 - JumpProb) * vPriority / u.GetOutDeg();
53  nodesByResidual.SetPriority(uId, nodesByResidual.GetPriority(uId) + residualChange);
54  //printf("set priority(%d) to %g\n", uId, double(nodesByResidual.GetPriority(uId)));
55  }
56  }
57 
58  // Copy residuals from nodesByResidual to ResultResiduals
59  nodesByResidual.GetPriorities(ResultResiduals);
60  ResultMaxResidual = nodesByResidual.IsEmpty() ? 0.0f : nodesByResidual.GetMaxPriority();
61  }
62 }
63 
64 namespace TSnap {
65 // Returns the endpoint of a random walk sampled from a random start node in StartNIdV. The walk has expected length 1/JumpProb, and restarts if it reaches a dead-end node (one with no out-neighbors).
66 template <class PGraph>
67 int SamplePersonalizedPageRank(const PGraph& Graph, double JumpProb, const TIntV& StartNIdV, TRnd& Rnd) {
68  int locationId = StartNIdV.GetRndVal(Rnd);
69  //printf("starting walk at %d\n", locationId);
70  while (Rnd.GetUniDev() >= JumpProb) {
71  TNGraph::TNodeI location = Graph->GetNI(locationId);
72  int d = location.GetOutDeg();
73  if (d > 0)
74  locationId = location.GetOutNId(Rnd.GetUniDevInt(d));
75  else
76  locationId = StartNIdV.GetRndVal(Rnd);
77  }
78  return locationId;
79 }
80 
81 // Returns the Personalized PageRank from given vector of start node ids to the given target. This is equal to the probability
82 // that a random walk from a source uniformly sampled from the given start node ids stops at the given target node id, where
83 // the walk stops after each stop with probability JumpProb. As long as the true PPR is greater than MinProbability (which defaults
84 // to 1/n on an n node graph), the mean relative error will be approximately the given RelativeError. If provableRelativeError is set to
85 // true, the result will provably (with high probability) have at most the given RelativeError, but this comes at the cost of significantly greater
86 // running time. Uses the algorithm presented in "Personalized PageRank Estimation and Search: A Bidirectional Approach" by Lofgren, Banerjee, and Goel.
87 template <class PGraph>
88  double GetPersonalizedPageRankBidirectional(const PGraph& Graph,
89  double JumpProb,
90  const TIntV& StartNIdV,
91  int TargetNId,
92  double MinProbability = -1.0,
93  double RelativeError = 0.1,
94  bool provableRelativeError = false,
95  bool PrintTimeForTuning = false) {
96  if (MinProbability <= 0.0) { // Check if minProbability not set.
97  MinProbability = 1.0 / Graph->GetNodes();
98  }
99  // In experiments, when relativeError = 0.1, a chernoff constant of 0.07 gave mean relative error less than 0.1 on several realistic graphs.
100  float kChernoffConstant = provableRelativeError ? 12 * exp((double) 1) * log(2 / 1.0e-9) : 0.07;
101  float kSecondsPerWalk = 4.0e-7; // The time required to generate a random walk. Can be tuned so that forward and reverse running times are equal, to improve running time
102  float WalkCountRMaxRatio = kChernoffConstant / (RelativeError * RelativeError) / MinProbability;
103  float ForwardSecondsRMaxRatio = kSecondsPerWalk * WalkCountRMaxRatio;
104 
105 
106  double startTime = WallClockTime();
107  // Results from ApproxContributionsBalanced are set by reference:
108  TIntFltH Estimates, Residuals;
109  float MaxResidual;
110  ApproxContributionsBalanced(Graph, JumpProb, TargetNId, ForwardSecondsRMaxRatio, Estimates, Residuals, MaxResidual);
111 
112  double reverseTime = WallClockTime() - startTime;
113  startTime = WallClockTime();
114 
115  double Estimate = 0.0;
116  // First incorporate the average Estimates value for starting nodes
117  for (int i = 0; i < StartNIdV.Len(); i++) {
118  Estimate += Estimates.GetDatWithDefault(StartNIdV[i], 0.0) / StartNIdV.Len();
119  }
120 
121  int RandomWalkCount = static_cast<int>(WalkCountRMaxRatio * MaxResidual);
122  TRnd Rnd(0); // 0 means seed from clock. We use an explicit Rnd for thread safety.
123  for (int i = 0; i < RandomWalkCount; i++) {
124  int vId = SamplePersonalizedPageRank(Graph, JumpProb, StartNIdV, Rnd);
125  Estimate += Residuals.GetDatWithDefault(vId, 0.0) / RandomWalkCount;
126  }
127  double forwardTime = WallClockTime() - startTime;
128  if (PrintTimeForTuning) printf("forwardTime reverseTime %g %g\n", forwardTime, reverseTime);
129 
130  return Estimate;
131 }
132 
133 // Convenience method that takes a single startNId and converts it to a vector of length 1 before calling GetPersonalizedPageRankBidirectional.
134 template <class PGraph>
135  double GetRndWalkRestartBidirectional(const PGraph& Graph,
136  double JumpProb,
137  int StartNId,
138  int TargetNId,
139  double minProbability = -1.0,
140  double relativeError = 0.1,
141  bool proveRelativeError = false,
142  bool PrintTimeForTuning = false) {
143  return GetPersonalizedPageRankBidirectional(Graph, JumpProb, TIntV::GetV(StartNId), TargetNId,
144  minProbability, relativeError, proveRelativeError, PrintTimeForTuning);
145  }
146 
147 }; // namespace TSnap
148 
149 #endif
Main namespace for all the Snap global entities.
Definition: alg.h:1
void ApproxContributionsBalanced(const PGraph &Graph, double JumpProb, int TargetNId, float ForwardSecondsRMaxRatio, TIntFltH &ResultEstimates, TIntFltH &ResultResiduals, float &ResultMaxResidual)
Definition: randwalk.h:32
float GetPriority(const TVal &X)
Definition: priorityqueue.h:49
Definition: dt.h:11
void GetPriorities(THash< TVal, TFlt > &Result)
Definition: priorityqueue.h:83
double GetRndWalkRestartBidirectional(const PGraph &Graph, double JumpProb, int StartNId, int TargetNId, double minProbability=-1.0, double relativeError=0.1, bool proveRelativeError=false, bool PrintTimeForTuning=false)
Definition: randwalk.h:135
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
TDat GetDatWithDefault(const TKey &Key, TDat DefaultValue)
Definition: hash.h:264
int SamplePersonalizedPageRank(const PGraph &Graph, double JumpProb, const TIntV &StartNIdV, TRnd &Rnd)
Definition: randwalk.h:67
const TVal & GetRndVal(TRnd &Rnd=TInt::Rnd) const
Returns a reference to a random element in the vector.
Definition: ds.h:589
void SetPriority(const TVal &X, float NewPriority)
Definition: priorityqueue.h:31
int GetOutDeg() const
Returns out-degree of the current node.
Definition: graph.h:406
void Insert(const TVal &X, float Priority)
Definition: priorityqueue.h:23
Node iterator. Only forward iteration (operator++) is supported.
Definition: graph.h:383
double GetUniDev()
Definition: dt.h:30
int GetUniDevInt(const int &Range=0)
Definition: dt.cpp:39
static TVec< TInt, int > GetV(const TInt &Val1)
Returns a vector on element Val1.
Definition: ds.h:848
int GetInDeg() const
Returns in-degree of the current node.
Definition: graph.h:404
float GetMaxPriority()
Definition: priorityqueue.h:57
int GetInNId(const int &NodeN) const
Returns ID of NodeN-th in-node (the node pointing to the current node).
Definition: graph.h:412
double GetPersonalizedPageRankBidirectional(const PGraph &Graph, double JumpProb, const TIntV &StartNIdV, int TargetNId, double MinProbability=-1.0, double RelativeError=0.1, bool provableRelativeError=false, bool PrintTimeForTuning=false)
Definition: randwalk.h:88
int GetOutNId(const int &NodeN) const
Returns ID of NodeN-th out-node (the node the current node points to).
Definition: graph.h:416
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:430