SNAP Library, Developer Reference
2012-10-15 15:06:59
SNAP, a general purpose network analysis and graph mining library
|
00001 00002 // Forward 00003 ClassHdTP(THtmlTok, PHtmlTok) 00004 ClassHdTP(THtmlDoc, PHtmlDoc) 00005 00007 // Html-Lexical-Chars 00008 typedef enum { 00009 hlctSpace, hlctAlpha, hlctNum, hlctSym, 00010 hlctLTag, hlctRTag, hlctEof} THtmlLxChTy; 00011 00012 ClassTP(THtmlLxChDef, PHtmlLxChDef)//{ 00013 private: 00014 TIntV ChTyV; 00015 TChV UcChV; 00016 TChV LcChV; 00017 TStrStrH EscStrH; 00018 void SetUcCh(const char& UcCh, const char& LcCh); 00019 void SetUcCh(const TStr& Str); 00020 void SetChTy(const THtmlLxChTy& ChTy, const TStr& Str); 00021 void SetEscStr(const TStr& SrcStr, const TStr& DstStr); 00022 public: 00023 THtmlLxChDef(); 00024 THtmlLxChDef(TSIn& SIn): ChTyV(SIn), UcChV(SIn), LcChV(SIn), EscStrH(SIn){} 00025 static PHtmlLxChDef Load(TSIn& SIn){return new THtmlLxChDef(SIn);} 00026 void Save(TSOut& SOut){ 00027 ChTyV.Save(SOut); UcChV.Save(SOut); LcChV.Save(SOut); EscStrH.Save(SOut);} 00028 00029 THtmlLxChDef& operator=(const THtmlLxChDef&){Fail; return *this;} 00030 00031 // character type operations 00032 int GetChTy(const char& Ch) const {return ChTyV[Ch-TCh::Mn];} 00033 bool IsEoln(const char& Ch) const {return (Ch==TCh::CrCh)||(Ch==TCh::LfCh);} 00034 bool IsWs(const char& Ch) const { 00035 return (Ch==' ')||(Ch==TCh::TabCh)||(Ch==TCh::CrCh)||(Ch==TCh::LfCh);} 00036 bool IsSpace(const char& Ch) const {return int(ChTyV[Ch-TCh::Mn])==hlctSpace;} 00037 bool IsAlpha(const char& Ch) const {return int(ChTyV[Ch-TCh::Mn])==hlctAlpha;} 00038 bool IsNum(const char& Ch) const {return int(ChTyV[Ch-TCh::Mn])==hlctNum;} 00039 bool IsAlNum(const char& Ch) const { 00040 return (int(ChTyV[Ch-TCh::Mn])==hlctAlpha)||(int(ChTyV[Ch-TCh::Mn])==hlctNum);} 00041 bool IsSym(const char& Ch) const {return int(ChTyV[Ch-TCh::Mn])==hlctSym;} 00042 bool IsUrl(const char& Ch) const { 00043 int ChTy=ChTyV[Ch-TCh::Mn]; 00044 return (ChTy==hlctAlpha)||(ChTy==hlctNum)|| 00045 (Ch=='.')||(Ch=='-')||(Ch==':')||(Ch=='/')||(Ch=='~');} 00046 00047 // upper/lower-case & escape-string operations 00048 bool IsUc(const char& Ch) const {return Ch==UcChV[Ch-TCh::Mn];} 00049 bool IsLc(const char& Ch) const {return Ch==LcChV[Ch-TCh::Mn];} 00050 char GetUc(const char& Ch) const {return UcChV[Ch-TCh::Mn];} 00051 char GetLc(const char& Ch) const {return LcChV[Ch-TCh::Mn];} 00052 void GetUcChA(TChA& ChA) const { 00053 for (int ChN=0; ChN<ChA.Len(); ChN++){ChA.PutCh(ChN, GetUc(ChA[ChN]));}} 00054 void GetLcChA(TChA& ChA) const { 00055 for (int ChN=0; ChN<ChA.Len(); ChN++){ChA.PutCh(ChN, GetLc(ChA[ChN]));}} 00056 TStr GetUcStr(const TStr& Str) const { 00057 TChA ChA(Str); GetUcChA(ChA); return ChA;} 00058 TStr GetLcStr(const TStr& Str) const { 00059 TChA ChA(Str); GetLcChA(ChA); return ChA;} 00060 TStr GetEscStr(const TStr& Str) const; 00061 00062 // standard entry points 00063 static PHtmlLxChDef ChDef; 00064 static PHtmlLxChDef GetChDef(){IAssert(!ChDef.Empty()); return ChDef;} 00065 static THtmlLxChDef& GetChDefRef(){IAssert(!ChDef.Empty()); return *ChDef;} 00066 00067 // character-set transformations 00068 static TStr GetCSZFromYuascii(const TChA& ChA); 00069 static TStr GetCSZFromWin1250(const TChA& ChA); 00070 static TStr GetWin1250FromYuascii(const TChA& ChA); 00071 static TStr GetIsoCeFromYuascii(const TChA& ChA); 00072 }; 00073 00075 // Html-Lexical 00076 typedef enum { 00077 hsyUndef, hsyStr, hsyNum, hsySSym, hsyUrl, 00078 hsyBTag, hsyETag, hsyMTag, hsyEof} THtmlLxSym; 00079 00080 class THtmlLx{ 00081 private: 00082 static THtmlLxChDef ChDef; 00083 PSIn SIn; 00084 TSIn& RSIn; 00085 bool DoParseArg; 00086 TChA ChStack; 00087 char Ch; 00088 int ChX; 00089 bool EscCh; 00090 TChA EscChA; 00091 TChA ArgNm; 00092 TChA ArgVal; 00093 void GetCh(){ 00094 if (ChStack.Empty()){ 00095 if (RSIn.Eof()){Ch=TCh::EofCh;} else {Ch=RSIn.GetCh(); ChX++;} 00096 } else { 00097 Ch=ChStack.Pop(); ChX++; 00098 } 00099 SymChA+=Ch; 00100 } 00101 void GetEscCh(); 00102 void GetMetaTag(); 00103 void GetTag(); 00104 public: 00105 THtmlLxSym Sym; 00106 int SymBChX, SymEChX; 00107 TChA ChA; 00108 TChA UcChA; 00109 TChA SymChA; 00110 int PreSpaces; 00111 TChA PreSpaceChA; 00112 typedef TStrKdV TArgNmValV; 00113 TArgNmValV ArgNmValV; 00114 public: 00115 THtmlLx(const PSIn& _SIn, const bool& _DoParseArg=true): 00116 SIn(_SIn), RSIn(*SIn), DoParseArg(_DoParseArg), 00117 ChStack(), Ch(' '), ChX(0), EscCh(false), 00118 EscChA(), ArgNm(), ArgVal(), 00119 Sym(hsyUndef), SymBChX(0), SymEChX(0), ChA(), UcChA(), 00120 PreSpaces(0), PreSpaceChA(), ArgNmValV(){} 00121 00122 THtmlLx& operator=(const THtmlLx&){Fail; return *this;} 00123 00124 void PutCh(const char& _Ch){ 00125 ChStack.Push(Ch); if (!SymChA.Empty()){SymChA.Pop();} Ch=_Ch; ChX--;} 00126 void PutStr(const TStr& Str){ 00127 for (int ChN=Str.Len()-1; ChN>=0; ChN--){PutCh(Str[ChN]);}} 00128 THtmlLxSym GetSym(); 00129 PHtmlTok GetTok(const bool& DoUc=true); 00130 TStr GetPreSpaceStr() const { 00131 return TStr::GetSpaceStr(PreSpaces);} 00132 00133 int GetArgs() const {return ArgNmValV.Len();} 00134 TStr GetArgNm(const int& ArgN) const {return ArgNmValV[ArgN].Key;} 00135 TStr GetArgVal(const int& ArgN) const {return ArgNmValV[ArgN].Dat;} 00136 bool IsArg(const TStr& ArgNm) const {return ArgNmValV.IsIn(TStrKd(ArgNm));} 00137 TStr GetArg(const TStr& ArgNm, const TStr& DfArgVal=TStr()) const { 00138 int ArgN=ArgNmValV.SearchForw(TStrKd(ArgNm)); 00139 if (ArgN==-1){return DfArgVal;} else {return ArgNmValV[ArgN].Dat;}} 00140 void PutArg(const TStr& ArgNm, const TStr& ArgVal){ 00141 int ArgN=ArgNmValV.SearchForw(TStrKd(ArgNm)); 00142 if (ArgN==-1){ArgNmValV.Add(TStrKd(ArgNm, ArgVal));} 00143 else {ArgNmValV[ArgN]=TStrKd(ArgNm, ArgVal);}} 00144 TStr GetFullBTagStr() const; 00145 00146 void MoveToStrOrEof(const TStr& Str); 00147 void MoveToBTagOrEof(const TStr& TagNm); 00148 void MoveToBTag2OrEof(const TStr& TagNm1, const TStr& TagNm2); 00149 void MoveToBTag3OrEof(const TStr& TagNm1, const TStr& TagNm2, const TStr& TagNm3); 00150 void MoveToBTagOrETagOrEof(const TStr& BTagNm, const TStr& ETagNm); 00151 void MoveToBTagArgOrEof( 00152 const TStr& TagNm, const TStr& ArgNm, const TStr& ArgVal); 00153 void MoveToBTagArg2OrEof(const TStr& TagNm, 00154 const TStr& ArgNm1, const TStr& ArgVal1, 00155 const TStr& ArgNm2, const TStr& ArgVal2, const bool& AndOpP=true); 00156 void MoveToBTagOrEof( 00157 const TStr& TagNm1, const TStr& ArgNm1, const TStr& ArgVal1, 00158 const TStr& TagNm2, const TStr& ArgNm2, const TStr& ArgVal2); 00159 void MoveToETagOrEof(const TStr& TagNm); 00160 TStr GetTextOnlyStrToEof(); 00161 TStr GetStrToBTag(const TStr& TagNm, const bool& TxtOnlyP=false); 00162 TStr GetStrToBTag(const TStr& TagNm, const TStr& ArgNm, 00163 const TStr& ArgVal, const bool& TxtOnlyP=false); 00164 TStr GetStrToETag(const TStr& TagNm, const bool& TxtOnlyP=false); 00165 TStr GetStrToETag2(const TStr& TagNm1, const TStr& TagNm2, const bool& TxtOnlyP=false); 00166 TStr GetStrInTag(const TStr& TagNm, const bool& TxtOnlyP=false); 00167 TStr GetHRefBeforeStr(const TStr& Str); 00168 bool IsGetBTag(const TStr& TagNm); 00169 bool IsGetETag(const TStr& TagNm); 00170 00171 static TStr GetSymStr(const THtmlLxSym& Sym); 00172 static TStr GetEscapedStr(const TChA& ChA); 00173 static TStr GetAsciiStr(const TChA& ChA, const char& GenericCh='_'); 00174 static void GetTokStrV(const TStr& Str, TStrV& TokStrV); 00175 static TStr GetNoTag(const TStr& Str); 00176 }; 00177 00179 // Html-Token 00180 ClassTPV(THtmlTok, PHtmlTok, THtmlTokV)//{ 00181 private: 00182 THtmlLxSym Sym; 00183 TStr Str; 00184 THtmlLx::TArgNmValV ArgNmValV; 00185 public: 00186 THtmlTok(): Sym(hsyUndef), Str(), ArgNmValV(){} 00187 THtmlTok(const THtmlLxSym& _Sym): 00188 Sym(_Sym), Str(), ArgNmValV(){} 00189 THtmlTok(const THtmlLxSym& _Sym, const TStr& _Str): 00190 Sym(_Sym), Str(_Str), ArgNmValV(){} 00191 THtmlTok(const THtmlLxSym& _Sym, const TStr& _Str, 00192 const THtmlLx::TArgNmValV& _ArgNmValV): 00193 Sym(_Sym), Str(_Str), ArgNmValV(_ArgNmValV){} 00194 THtmlTok(TSIn&){Fail;} 00195 static PHtmlTok Load(TSIn&){Fail; return NULL;} 00196 void Save(TSOut&){Fail;} 00197 00198 THtmlTok& operator=(const THtmlTok&){Fail; return *this;} 00199 00200 THtmlLxSym GetSym() const {return Sym;} 00201 TStr GetStr() const {return Str;} 00202 TStr GetFullStr() const; 00203 bool IsArg(const TStr& ArgNm) const { 00204 return ArgNmValV.SearchForw(TStrKd(ArgNm))!=-1;} 00205 TStr GetArg(const TStr& ArgNm) const { 00206 return ArgNmValV[ArgNmValV.SearchForw(TStrKd(ArgNm))].Dat;} 00207 TStr GetArg(const TStr& ArgNm, const TStr& DfArgVal) const { 00208 int ArgN=ArgNmValV.SearchForw(TStrKd(ArgNm)); 00209 if (ArgN==-1){return DfArgVal;} else {return ArgNmValV[ArgN].Dat;}} 00210 bool IsUrlTok(TStr& RelUrlStr) const; 00211 bool IsRedirUrlTok() const; 00212 00213 void SaveTxt(const PSOut& SOut, const bool& TxtMode=true); 00214 00215 static const TStr ATagNm; 00216 static const TStr AreaTagNm; 00217 static const TStr BrTagNm; 00218 static const TStr CardTagNm; 00219 static const TStr CenterTagNm; 00220 static const TStr FrameTagNm; 00221 static const TStr H1TagNm; 00222 static const TStr H2TagNm; 00223 static const TStr H3TagNm; 00224 static const TStr H4TagNm; 00225 static const TStr H5TagNm; 00226 static const TStr H6TagNm; 00227 static const TStr ImgTagNm; 00228 static const TStr LiTagNm; 00229 static const TStr MetaTagNm; 00230 static const TStr PTagNm; 00231 static const TStr UlTagNm; 00232 static const TStr TitleTagNm; 00233 static const TStr TitleETagNm; 00234 00235 static const TStr AltArgNm; 00236 static const TStr HRefArgNm; 00237 static const TStr SrcArgNm; 00238 static const TStr TitleArgNm; 00239 static const TStr HttpEquivArgNm; 00240 00241 static bool IsBreakTag(const TStr& TagNm); 00242 static bool IsBreakTok(const PHtmlTok& Tok); 00243 static bool IsHTag(const TStr& TagNm, int& HTagN); 00244 static PHtmlTok GetHTok(const bool& IsBTag, const int& HTagN); 00245 }; 00246 00248 // Html-Document 00249 typedef enum { 00250 hdtAll, hdtStr, hdtStrNum, hdtTag, hdtA, hdtHRef, hdtUL} THtmlDocType; 00251 00252 ClassTPV(THtmlDoc, PHtmlDoc, THtmlDocV)//{ 00253 private: 00254 THtmlTokV TokV; 00255 public: 00256 THtmlDoc(): TokV(){} 00257 THtmlDoc( 00258 const PSIn& SIn, const THtmlDocType& Type=hdtAll, const bool& DoUc=true); 00259 static PHtmlDoc New( 00260 const PSIn& SIn, const THtmlDocType& Type=hdtAll, const bool& DoUc=true){ 00261 return PHtmlDoc(new THtmlDoc(SIn, Type, DoUc));} 00262 THtmlDoc(TSIn&){Fail;} 00263 static PHtmlDoc Load(TSIn&){Fail; return NULL;} 00264 void Save(TSOut&){Fail;} 00265 00266 THtmlDoc& operator=(const THtmlDoc&){Fail; return *this;} 00267 00268 int GetToks() const {return TokV.Len();} 00269 PHtmlTok GetTok(const int& TokN) const {return TokV[TokN];} 00270 PHtmlTok GetTok(const int& TokN, THtmlLxSym& Sym, TStr& Str) const { 00271 Sym=TokV[TokN]->GetSym(); Str=TokV[TokN]->GetStr(); return TokV[TokN];} 00272 void AddTokV(const THtmlTokV& _TokV){TokV.AddV(_TokV);} 00273 00274 static TStr GetTxtLnDoc(const TStr& HtmlStr); 00275 static TStr GetTxtLnDoc(const TStr& HtmlStr, const TStr& BaseUrlStr, 00276 const bool& OutUrlP, const bool& OutTagsP); 00277 00278 static PHtmlDoc LoadTxt( 00279 const TStr& FNm, const THtmlDocType& Type=hdtAll, const bool& DoUc=true){ 00280 PSIn SIn=TFIn::New(FNm); return PHtmlDoc(new THtmlDoc(SIn, Type, DoUc));} 00281 void SaveTxt(const PSOut& SOut, const bool& TxtMode=true) const; 00282 00283 static void SaveHtmlToTxt( 00284 const TStr& HtmlStr, const PSOut& TxtSOut, const TStr& BaseUrlStr, 00285 const bool& OutUrlP, const bool& OutToksP); 00286 static void SaveHtmlToTxt( 00287 const TStr& HtmlStr, const TStr& TxtFNm, const TStr& BaseUrlStr, 00288 const bool& OutUrlP, const bool& OutToksP); 00289 static void SaveHtmlToXml( 00290 const TStr& HtmlStr, const PSOut& XmlSOut, const TStr& BaseUrlStr, 00291 const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP, 00292 const bool& OutTagsP, const bool& OutArgsP); 00293 static void SaveHtmlToXml( 00294 const TStr& HtmlStr, const TStr& XmlFNm, const TStr& BaseUrlStr, 00295 const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP, 00296 const bool& OutTagsP, const bool& OutArgsP); 00297 00298 static TLxSym GetLxSym(const THtmlLxSym& HtmlLxSym, const TChA& ChA); 00299 00300 static bool _IsTagRedir( 00301 const TStr& TagStr, const TStr& ArgNm, THtmlLx& Lx, 00302 const TStr& BaseUrlStr, const TStr& RedirUrlStr); 00303 static TStr GetRedirHtmlDocStr(const TStr& HtmlStr, 00304 const TStr& BaseUrlStr, const TStr& RedirUrlStr); 00305 }; 00306 00308 // Html-Hyper-Link-Document-Vector 00309 ClassTP(THtmlHldV, PHtmlHldV)//{ 00310 private: 00311 PHtmlDoc RefHtmlDoc; 00312 THtmlDocV HldV; 00313 public: 00314 THtmlHldV(const PHtmlDoc& _RefHtmlDoc, const int& HldWnLen=10); 00315 THtmlHldV(TSIn&){Fail;} 00316 static PHtmlHldV Load(TSIn&){Fail; return NULL;} 00317 void Save(TSOut&){Fail;} 00318 00319 THtmlHldV& operator=(const THtmlHldV&){Fail; return *this;} 00320 00321 PHtmlDoc GetRefHtmlDoc(){return RefHtmlDoc;} 00322 int GetHlds(){return HldV.Len();} 00323 PHtmlDoc GetHld(const int& HldN){return HldV[HldN];} 00324 }; 00325 00327 // Web-Page 00328 ClassTPV(TWebPg, PWebPg, TWebPgV)//{ 00329 private: 00330 TStrV UrlStrV; 00331 TStrV IpNumV; 00332 PHttpResp HttpResp; 00333 uint64 FetchMSecs; 00334 public: 00335 TWebPg(): UrlStrV(), IpNumV(), HttpResp(){} 00336 TWebPg(const TStrV& _UrlStrV, const TStrV& _IpNumV, const PHttpResp& _HttpResp): 00337 UrlStrV(_UrlStrV), IpNumV(_IpNumV), HttpResp(_HttpResp){} 00338 static PWebPg New(const TStrV& UrlStrV, const TStrV& IpNumV, const PHttpResp& HttpResp){ 00339 return new TWebPg(UrlStrV, IpNumV, HttpResp);} 00340 static PWebPg New(const TStrV& UrlStrV, const PHttpResp& HttpResp){ 00341 return new TWebPg(UrlStrV, TStrV(), HttpResp);} 00342 static PWebPg New(const TStr& UrlStr, const PHttpResp& HttpResp){ 00343 TStrV UrlStrV; UrlStrV.Add(UrlStr); 00344 return new TWebPg(UrlStrV, TStrV(), HttpResp);} 00345 ~TWebPg(){} 00346 TWebPg(TSIn&){Fail;} 00347 static PWebPg Load(TSIn&){Fail; return NULL;} 00348 void Save(TSOut&){Fail;} 00349 00350 TWebPg& operator=(const TWebPg&){Fail; return *this;} 00351 00352 int GetUrls() const {return UrlStrV.Len();} 00353 TStr GetUrlStr(const int& UrlN=-1) const { 00354 if (UrlN==-1){return UrlStrV.Last();} else {return UrlStrV[UrlN];}} 00355 PUrl GetUrl(const int& UrlN=-1) const { 00356 TStr UrlStr; 00357 if (UrlN==-1){UrlStr=UrlStrV.Last();} else {UrlStr=UrlStrV[UrlN];} 00358 return TUrl::New(UrlStr);} 00359 00360 int GetIps() const {return IpNumV.Len();} 00361 TStr GetIpNum(const int& IpN=-1) const { 00362 if (IpN==-1){return IpNumV.Last();} else {return IpNumV[IpN];}} 00363 00364 PHttpResp GetHttpResp() const {return HttpResp;} 00365 TStr GetHttpHdStr() const {return GetHttpResp()->GetHdStr();} 00366 TStr GetHttpBodyAsStr() const {return GetHttpResp()->GetBodyAsStr();} 00367 //void GetOutUrlStrV(TStrV& OutUrlStrV) const; 00368 void GetOutUrlV(TUrlV& OutUrlV, TUrlV& OutRedirUrlV) const; 00369 void GetOutUrlV(TUrlV& OutUrlV) const { 00370 TUrlV OutRedirUrlV; GetOutUrlV(OutUrlV, OutRedirUrlV);} 00371 void GetOutDescUrlStrKdV(TStrKdV& OutDescUrlStrKdV) const; 00372 00373 // fetch time 00374 void PutFetchMSecs(const uint64& _FetchMSecs){FetchMSecs=_FetchMSecs;} 00375 uint64 GetFetchMSecs() const {return FetchMSecs;} 00376 00377 void SaveAsHttpBody(const TStr& FNm) const; 00378 void SaveAsHttp(const TStr& FNm) const; 00379 00380 bool IsTxt() const; 00381 };