SNAP Library 6.0, Developer Reference  2020-12-09 16:24:20
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
THtmlDoc Class Reference

#include <html.h>

Collaboration diagram for THtmlDoc:

Public Member Functions

 THtmlDoc ()
 
 THtmlDoc (const PSIn &SIn, const THtmlDocType &Type=hdtAll, const bool &DoUc=true)
 
 THtmlDoc (TSIn &)
 
void Save (TSOut &)
 
THtmlDoc & operator= (const THtmlDoc &)
 
int GetToks () const
 
PHtmlTok GetTok (const int &TokN) const
 
PHtmlTok GetTok (const int &TokN, THtmlLxSym &Sym, TStr &Str) const
 
void AddTokV (const THtmlTokV &_TokV)
 
void SaveTxt (const PSOut &SOut, const bool &TxtMode=true) const
 

Static Public Member Functions

static PHtmlDoc New (const PSIn &SIn, const THtmlDocType &Type=hdtAll, const bool &DoUc=true)
 
static PHtmlDoc Load (TSIn &)
 
static TStr GetTxtLnDoc (const TStr &HtmlStr)
 
static TStr GetTxtLnDoc (const TStr &HtmlStr, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutTagsP)
 
static PHtmlDoc LoadTxt (const TStr &FNm, const THtmlDocType &Type=hdtAll, const bool &DoUc=true)
 
static void SaveHtmlToTxt (const TStr &HtmlStr, const PSOut &TxtSOut, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutToksP)
 
static void SaveHtmlToTxt (const TStr &HtmlStr, const TStr &TxtFNm, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutToksP)
 
static void SaveHtmlToXml (const TStr &HtmlStr, const PSOut &XmlSOut, const TStr &BaseUrlStr, const bool &OutTextP, const bool &OutUrlP, const bool &OutToksP, const bool &OutTagsP, const bool &OutArgsP)
 
static void SaveHtmlToXml (const TStr &HtmlStr, const TStr &XmlFNm, const TStr &BaseUrlStr, const bool &OutTextP, const bool &OutUrlP, const bool &OutToksP, const bool &OutTagsP, const bool &OutArgsP)
 
static TLxSym GetLxSym (const THtmlLxSym &HtmlLxSym, const TChA &ChA)
 
static bool _IsTagRedir (const TStr &TagStr, const TStr &ArgNm, THtmlLx &Lx, const TStr &BaseUrlStr, const TStr &RedirUrlStr)
 
static TStr GetRedirHtmlDocStr (const TStr &HtmlStr, const TStr &BaseUrlStr, const TStr &RedirUrlStr)
 

Private Attributes

TCRef CRef
 
THtmlTokV TokV
 

Friends

class TPt< THtmlDoc >
 

Detailed Description

Definition at line 254 of file html.h.

Constructor & Destructor Documentation

THtmlDoc::THtmlDoc ( )
inline

Definition at line 258 of file html.h.

258 : TokV(){}
THtmlTokV TokV
Definition: html.h:256
THtmlDoc::THtmlDoc ( const PSIn & SIn,
const THtmlDocType & Type = hdtAll,
const bool &  DoUc = true 
)

Definition at line 779 of file html.cpp.

References TVec< TVal, TSizeTy >::Add(), THtmlTok::AreaTagNm, Fail, THtmlLx::GetSym(), THtmlLx::GetTok(), hdtA, hdtAll, hdtHRef, hdtStr, hdtStrNum, hdtTag, hdtUL, hsyBTag, hsyEof, hsyETag, hsyNum, hsyStr, THtmlTok::ImgTagNm, THtmlLx::Sym, TokV, THtmlLx::UcChA, and THtmlTok::UlTagNm.

779  :
780  TokV(1000, 0){
781  THtmlLx Lx(SIn);
782  bool MkTok=false; bool InUL=false;
783  while (Lx.GetSym()!=hsyEof){
784  switch (Type){
785  case hdtAll: MkTok=true; break;
786  case hdtStr: MkTok=(Lx.Sym==hsyStr); break;
787  case hdtStrNum: MkTok=(Lx.Sym==hsyStr)||(Lx.Sym==hsyNum); break;
788  case hdtTag: MkTok=(Lx.Sym==hsyBTag)||(Lx.Sym==hsyETag); break;
789  case hdtA: MkTok=(Lx.Sym==hsyBTag)&&(Lx.UcChA==THtmlTok::ATagNm); break;
790  case hdtHRef:
791  MkTok=(Lx.Sym==hsyBTag)&&
792  ((Lx.UcChA==THtmlTok::ATagNm)||(Lx.UcChA==THtmlTok::AreaTagNm)||
793  (Lx.UcChA==THtmlTok::FrameTagNm)||(Lx.UcChA==THtmlTok::ImgTagNm)||
794  (Lx.UcChA==THtmlTok::MetaTagNm));
795  break;
796  case hdtUL:
797  if ((Lx.Sym==hsyBTag)&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=true;}
798  MkTok=InUL;
799  if ((Lx.Sym==hsyETag)&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=false;}
800  break;
801  default: Fail;
802  }
803  if (MkTok){TokV.Add(Lx.GetTok(DoUc));}
804  }
806 }
Definition: html.h:252
static const TStr FrameTagNm
Definition: html.h:222
Definition: html.h:252
Definition: html.h:252
Definition: html.h:182
#define Fail
Definition: bd.h:238
Definition: html.h:79
static const TStr MetaTagNm
Definition: html.h:231
static const TStr AreaTagNm
Definition: html.h:218
Definition: html.h:252
Definition: html.h:252
static const TStr ATagNm
Definition: html.h:217
THtmlTokV TokV
Definition: html.h:256
static const TStr UlTagNm
Definition: html.h:233
Definition: html.h:79
Definition: html.h:80
static const TStr ImgTagNm
Definition: html.h:229
Definition: html.h:80
TPt< THtmlTok > PHtmlTok
Definition: html.h:5
Definition: html.h:252
Definition: html.h:80
Definition: html.h:82
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602

Here is the call graph for this function:

THtmlDoc::THtmlDoc ( TSIn & )
inline

Definition at line 264 of file html.h.

References Fail.

264 {Fail;}
#define Fail
Definition: bd.h:238

Member Function Documentation

bool THtmlDoc::_IsTagRedir ( const TStr & TagStr,
const TStr & ArgNm,
THtmlLx & Lx,
const TStr & BaseUrlStr,
const TStr & RedirUrlStr 
)
static

Definition at line 1106 of file html.cpp.

References THtmlLx::ChA, THtmlLx::GetArg(), TUrlEnv::GetFullUrlStr(), TUrl::GetUrlStr(), hsyBTag, IAssert, THtmlLx::IsArg(), TUrl::IsOk(), TUrl::New(), TUrlEnv::New(), THtmlLx::PutArg(), THtmlLx::Sym, and usHttp.

Referenced by GetRedirHtmlDocStr().

1108  {
1109  IAssert(Lx.Sym==hsyBTag);
1110  if ((Lx.ChA==TagStr)&&(Lx.IsArg(ArgNm))){
1111  TStr RelUrlStr=Lx.GetArg(ArgNm);
1112  PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
1113  if (Url->IsOk(usHttp)){
1114  TStr UrlStr=Url->GetUrlStr();
1115  PUrlEnv RedirUrlEnv=TUrlEnv::New(RedirUrlStr, "url", UrlStr);
1116  Lx.PutArg(ArgNm, RedirUrlEnv->GetFullUrlStr());
1117  return true;
1118  } else {
1119  return false;
1120  }
1121  } else {
1122  return false;
1123  }
1124 }
#define IAssert(Cond)
Definition: bd.h:262
Definition: url.h:5
static PUrl New(const TStr &RelUrlStr, const TStr &BaseUrlStr=TStr())
Definition: url.h:25
bool IsArg(const TStr &ArgNm) const
Definition: html.h:138
void PutArg(const TStr &ArgNm, const TStr &ArgVal)
Definition: html.h:142
TStr GetArg(const TStr &ArgNm, const TStr &DfArgVal=TStr()) const
Definition: html.h:139
static PUrlEnv New()
Definition: url.h:113
TChA ChA
Definition: html.h:109
Definition: html.h:80
Definition: dt.h:412
THtmlLxSym Sym
Definition: html.h:107
Definition: bd.h:196
TStr GetFullUrlStr() const
Definition: url.cpp:445

Here is the call graph for this function:

Here is the caller graph for this function:

void THtmlDoc::AddTokV ( const THtmlTokV & _TokV)
inline

Definition at line 274 of file html.h.

Referenced by THtmlHldV::THtmlHldV().

274 {TokV.AddV(_TokV);}
THtmlTokV TokV
Definition: html.h:256
TSizeTy AddV(const TVec< TVal, TSizeTy > &ValV)
Adds the elements of the vector ValV to the end of the vector.
Definition: ds.h:1110

Here is the caller graph for this function:

TLxSym THtmlDoc::GetLxSym ( const THtmlLxSym & HtmlLxSym,
const TChA & ChA 
)
static

Definition at line 1092 of file html.cpp.

References Fail, TLxSymStr::GetSSym(), hsyBTag, hsyEof, hsyETag, hsyNum, hsySSym, hsyStr, hsyUndef, hsyUrl, syEof, syFlt, syStr, and syUndef.

1092  {
1093  switch (HtmlLxSym){
1094  case hsyUndef: return syUndef;
1095  case hsyStr: return syStr;
1096  case hsyNum: return syFlt;
1097  case hsySSym: return TLxSymStr::GetSSym(ChA);
1098  case hsyUrl: return syStr;
1099  case hsyBTag: return syStr;
1100  case hsyETag: return syStr;
1101  case hsyEof: return syEof;
1102  default: Fail; return syUndef;
1103  }
1104 }
Definition: html.h:79
#define Fail
Definition: bd.h:238
Definition: html.h:79
Definition: lx.h:45
Definition: lx.h:51
Definition: html.h:79
Definition: lx.h:45
Definition: html.h:79
Definition: html.h:80
static TLxSym GetSSym(const TStr &Str)
Definition: lx.cpp:186
Definition: html.h:80
Definition: html.h:79
Definition: html.h:80
Definition: lx.h:45

Here is the call graph for this function:

TStr THtmlDoc::GetRedirHtmlDocStr ( const TStr & HtmlStr,
const TStr & BaseUrlStr,
const TStr & RedirUrlStr 
)
static

Definition at line 1126 of file html.cpp.

References _IsTagRedir(), TMOut::GetAsStr(), THtmlLx::GetFullBTagStr(), THtmlLx::GetSym(), hsyBTag, hsyEof, TStrIn::New(), THtmlLx::PreSpaceChA, TSOut::PutStr(), THtmlLx::Sym, and THtmlLx::SymChA.

1127  {
1128  PSIn SIn=TStrIn::New(HtmlStr);
1129  TMOut SOut;
1130  THtmlLx Lx(SIn);
1131  while (Lx.GetSym()!=hsyEof){
1132  SOut.PutStr(Lx.PreSpaceChA);
1133  if ((Lx.Sym==hsyBTag)&&(
1134  (_IsTagRedir(THtmlTok::ATagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr))||
1135  (_IsTagRedir(THtmlTok::AreaTagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr))||
1136  (_IsTagRedir(THtmlTok::FrameTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr))||
1137  (_IsTagRedir(THtmlTok::ImgTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr)))){
1138  SOut.PutStr(Lx.GetFullBTagStr());
1139  } else {
1140  SOut.PutStr(Lx.SymChA());
1141  }
1142  }
1143  return SOut.GetAsStr();
1144 }
static const TStr FrameTagNm
Definition: html.h:222
static const TStr HRefArgNm
Definition: html.h:238
TStr GetAsStr() const
Definition: fl.cpp:869
static const TStr AreaTagNm
Definition: html.h:218
static const TStr ATagNm
Definition: html.h:217
static PSIn New(const TStr &Str)
Definition: dt.h:711
Definition: fl.h:495
static const TStr ImgTagNm
Definition: html.h:229
Definition: html.h:80
int PutStr(const char *CStr)
Definition: fl.cpp:117
Definition: html.h:80
Definition: html.h:82
static const TStr SrcArgNm
Definition: html.h:239
static bool _IsTagRedir(const TStr &TagStr, const TStr &ArgNm, THtmlLx &Lx, const TStr &BaseUrlStr, const TStr &RedirUrlStr)
Definition: html.cpp:1106

Here is the call graph for this function:

PHtmlTok THtmlDoc::GetTok ( const int &  TokN) const
inline

Definition at line 271 of file html.h.

Referenced by TWebPg::GetOutDescUrlStrKdV(), TWebPg::GetOutUrlV(), and THtmlHldV::THtmlHldV().

271 {return TokV[TokN];}
THtmlTokV TokV
Definition: html.h:256

Here is the caller graph for this function:

PHtmlTok THtmlDoc::GetTok ( const int &  TokN,
THtmlLxSym & Sym,
TStr & Str 
) const
inline

Definition at line 272 of file html.h.

References TStr::GetStr().

272  {
273  Sym=TokV[TokN]->GetSym(); Str=TokV[TokN]->GetStr(); return TokV[TokN];}
THtmlTokV TokV
Definition: html.h:256

Here is the call graph for this function:

int THtmlDoc::GetToks ( ) const
inline

Definition at line 270 of file html.h.

Referenced by TWebPg::GetOutDescUrlStrKdV(), TWebPg::GetOutUrlV(), and THtmlHldV::THtmlHldV().

270 {return TokV.Len();}
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
THtmlTokV TokV
Definition: html.h:256

Here is the caller graph for this function:

TStr THtmlDoc::GetTxtLnDoc ( const TStr & HtmlStr)
static

Definition at line 808 of file html.cpp.

References THtmlLx::ChA, TStr::CStr(), TChA::Empty(), THtmlLx::GetSym(), hsyBTag, hsyEof, hsyETag, hsyNum, hsySSym, hsyStr, TChA::LastCh(), TStrIn::New(), THtmlLx::PreSpaces, and THtmlLx::Sym.

Referenced by SaveHtmlToTxt().

808  {
809  TChA LnDocChA;
810  // prepare html parsing
811  PSIn HtmlSIn=TStrIn::New(HtmlStr);
812  THtmlLx HtmlLx(HtmlSIn);
813  bool InScript=false;
814  // save text
815  while (HtmlLx.GetSym()!=hsyEof){
816  TStr Str=HtmlLx.ChA;
817  switch (HtmlLx.Sym){
818  case hsyStr:
819  case hsyNum:
820  case hsySSym:
821  if (InScript){break;}
822  if (HtmlLx.PreSpaces>0){LnDocChA+=' ';}
823  LnDocChA+=Str.CStr();
824  break;
825  case hsyBTag:
826  if ((!LnDocChA.Empty())&&(LnDocChA.LastCh()!=' ')){LnDocChA+=' ';}
827  if ((!InScript)&&(Str=="<SCRIPT>")){InScript=true;}
828  break;
829  case hsyETag:
830  if ((!LnDocChA.Empty())&&(LnDocChA.LastCh()!=' ')){LnDocChA+=' ';}
831  if ((InScript)&&(Str=="<SCRIPT>")){InScript=false;}
832  break;
833  default: break;
834  }
835  }
836  // return result
837  return LnDocChA;
838 }
Definition: html.h:79
bool Empty() const
Definition: dt.h:260
Definition: html.h:79
static PSIn New(const TStr &Str)
Definition: dt.h:711
char LastCh() const
Definition: dt.h:281
Definition: dt.h:201
Definition: html.h:79
Definition: html.h:80
Definition: html.h:80
Definition: dt.h:412
Definition: html.h:80
Definition: html.h:82
char * CStr()
Definition: dt.h:479

Here is the call graph for this function:

Here is the caller graph for this function:

TStr THtmlDoc::GetTxtLnDoc ( const TStr & HtmlStr,
const TStr & BaseUrlStr,
const bool &  OutUrlP,
const bool &  OutTagsP 
)
static

Definition at line 840 of file html.cpp.

References THtmlLx::ChA, Fail, TStr::GetSubStr(), THtmlLx::GetSym(), THtmlLx::GetTok(), TUrl::GetUrlStr(), TXmlLx::GetXmlStrFromPlainStr(), hsyBTag, hsyEof, hsyETag, hsyMTag, hsyNum, hsySSym, hsyStr, hsyUndef, hsyUrl, TUrl::IsOk(), TChA::LastCh(), TStr::Len(), TUrl::New(), TStrIn::New(), THtmlLx::PreSpaces, and THtmlLx::Sym.

841  {
842  // prepare output-string
843  TChA OutChA; OutChA+=' ';
844  // prepare html parsing
845  PSIn HtmlSIn=TStrIn::New(HtmlStr);
846  THtmlLx HtmlLx(HtmlSIn);
847  bool InScript=false;
848  // save text
849  while (HtmlLx.GetSym()!=hsyEof){
850  TStr Str=HtmlLx.ChA;
851  switch (HtmlLx.Sym){
852  case hsyUndef:
853  case hsyUrl:
854  case hsyMTag:
855  break;
856  case hsyStr:
857  case hsyNum:
858  case hsySSym:
859  if (InScript){break;}
860  if (HtmlLx.PreSpaces>0){if (OutChA.LastCh()!=' '){OutChA+=' ';}}
861  OutChA+=Str;
862  break;
863  case hsyBTag:
864  // extract tag name
865  Str=Str.GetSubStr(1, Str.Len()-2);
866  // process tag
867  if (!InScript){
868  // check script tag
869  if (Str=="SCRIPT"){
870  InScript=true; break;}
871  // output tag
872  if (OutTagsP){
873  OutChA+='<'; OutChA+=Str; OutChA+='>';
874  } else {
875  if (OutChA.LastCh()!=' '){OutChA+=' ';}
876  }
877  // check if URL present
878  PHtmlTok Tok=HtmlLx.GetTok();
879  TStr RelUrlStr;
880  if (Tok->IsUrlTok(RelUrlStr)){
881  PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
882  if (Url->IsOk()){
883  if (OutUrlP){
884  TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr());
885  OutChA+="<Url>"; OutChA+=XmlUrlStr; OutChA+="</Url>";
886  }
887  }
888  }
889  }
890  break;
891  case hsyETag:
892  // extract tag name
893  Str=Str.GetSubStr(1, Str.Len()-2);
894  // process tag
895  if (InScript){
896  if (Str=="SCRIPT"){
897  InScript=false; break;}
898  } else {
899  if (OutTagsP){
900  OutChA+="</"; OutChA+=Str; OutChA+='>';
901  } else {
902  if (OutChA.LastCh()!=' '){OutChA+=' ';}
903  }
904  }
905  break;
906  case hsyEof: break;
907  default: Fail;
908  }
909  }
910  // return string
911  return OutChA;
912 }
int Len() const
Definition: dt.h:490
Definition: html.h:79
#define Fail
Definition: bd.h:238
Definition: html.h:79
static PUrl New(const TStr &RelUrlStr, const TStr &BaseUrlStr=TStr())
Definition: url.h:25
TStr GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:811
static PSIn New(const TStr &Str)
Definition: dt.h:711
char LastCh() const
Definition: dt.h:281
Definition: html.h:80
Definition: html.h:79
Definition: dt.h:201
Definition: html.h:79
Definition: html.h:80
Definition: html.h:80
Definition: dt.h:412
Definition: html.h:79
Definition: html.h:80
Definition: html.h:82
static TStr GetXmlStrFromPlainStr(const TChA &PlainChA)
Definition: xml.cpp:968

Here is the call graph for this function:

static PHtmlDoc THtmlDoc::Load ( TSIn & )
inline static

Definition at line 265 of file html.h.

References Fail.

265 {Fail; return NULL;}
#define Fail
Definition: bd.h:238
static PHtmlDoc THtmlDoc::LoadTxt ( const TStr & FNm,
const THtmlDocType & Type = hdtAll,
const bool &  DoUc = true 
)
inline static

Definition at line 280 of file html.h.

References TFIn::New().

281  {
282  PSIn SIn=TFIn::New(FNm); return PHtmlDoc(new THtmlDoc(SIn, Type, DoUc));}
THtmlDoc()
Definition: html.h:258
TPt< THtmlDoc > PHtmlDoc
Definition: html.h:6
static PSIn New(const TStr &FNm)
Definition: fl.cpp:290

Here is the call graph for this function:

static PHtmlDoc THtmlDoc::New ( const PSIn & SIn,
const THtmlDocType & Type = hdtAll,
const bool &  DoUc = true 
)
inline static

Definition at line 261 of file html.h.

Referenced by TWebPg::GetOutDescUrlStrKdV(), and TWebPg::GetOutUrlV().

262  {
263  return PHtmlDoc(new THtmlDoc(SIn, Type, DoUc));}
THtmlDoc()
Definition: html.h:258
TPt< THtmlDoc > PHtmlDoc
Definition: html.h:6

Here is the caller graph for this function:

THtmlDoc& THtmlDoc::operator= ( const THtmlDoc & )
inline

Definition at line 268 of file html.h.

References Fail.

268 {Fail; return *this;}
#define Fail
Definition: bd.h:238
void THtmlDoc::Save ( TSOut & )
inline

Definition at line 266 of file html.h.

References Fail.

266 {Fail;}
#define Fail
Definition: bd.h:238
void THtmlDoc::SaveHtmlToTxt ( const TStr & HtmlStr,
const PSOut & TxtSOut,
const TStr & BaseUrlStr,
const bool &  OutUrlP,
const bool &  OutToksP 
)
static

Definition at line 928 of file html.cpp.

References GetTxtLnDoc(), and TStr::SaveTxt().

Referenced by SaveHtmlToTxt().

930  {
931  // get text-string from html-string
932  TStr TxtStr=GetTxtLnDoc(HtmlStr, BaseUrlStr, OutUrlP, OutTagsP);
933  // save text-string
934  TxtStr.SaveTxt(TxtSOut);
935 }
Definition: dt.h:412
static TStr GetTxtLnDoc(const TStr &HtmlStr)
Definition: html.cpp:808
void SaveTxt(const PSOut &SOut) const
Definition: dt.h:673

Here is the call graph for this function:

Here is the caller graph for this function:

void THtmlDoc::SaveHtmlToTxt ( const TStr & HtmlStr,
const TStr & TxtFNm,
const TStr & BaseUrlStr,
const bool &  OutUrlP,
const bool &  OutToksP 
)
static

Definition at line 937 of file html.cpp.

References TFOut::New(), and SaveHtmlToTxt().

939  {
940  // create output file
941  PSOut TxtSOut=TFOut::New(TxtFNm);
942  // save to output file
943  SaveHtmlToTxt(HtmlStr, TxtSOut, BaseUrlStr, OutUrlP, OutTagsP);
944 }
static PSOut New(const TStr &FNm, const bool &Append=false)
Definition: fl.cpp:442
static void SaveHtmlToTxt(const TStr &HtmlStr, const PSOut &TxtSOut, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutToksP)
Definition: html.cpp:928
Definition: bd.h:196

Here is the call graph for this function:

void THtmlDoc::SaveHtmlToXml ( const TStr & HtmlStr,
const PSOut & XmlSOut,
const TStr & BaseUrlStr,
const bool &  OutTextP,
const bool &  OutUrlP,
const bool &  OutToksP,
const bool &  OutTagsP,
const bool &  OutArgsP 
)
static

Definition at line 946 of file html.cpp.

References TVec< TVal, TSizeTy >::Add(), THtmlLx::ChA, TChA::Clr(), TChA::CStr(), TStr::CStr(), TChA::Empty(), TStr::Empty(), Fail, THtmlLx::GetArgNm(), THtmlLx::GetArgs(), THtmlLx::GetArgVal(), TSOut::GetFileId(), TStr::GetSubStr(), THtmlLx::GetSym(), THtmlLx::GetTok(), TUrl::GetUrlStr(), TXmlLx::GetXmlStrFromPlainStr(), hsyBTag, hsyEof, hsyETag, hsyMTag, hsyNum, hsySSym, hsyStr, hsyUndef, hsyUrl, TUrl::IsOk(), TStr::Len(), TVec< TVal, TSizeTy >::Len(), TUrl::New(), TStrIn::New(), and THtmlLx::Sym.

Referenced by SaveHtmlToXml().

949  {
950  // prepare output-file-id
951  TFileId fXml=XmlSOut->GetFileId();
952  // create outgoing url
953  TStrV OutUrlStrV;
954  // open top tag
955  fprintf(fXml, "<HtmlDoc>\n");
956  // save url
957  if (!BaseUrlStr.Empty()){
958  TStr XmlBaseUrlStr=TXmlLx::GetXmlStrFromPlainStr(BaseUrlStr);
959  fprintf(fXml, "<BaseUrl>%s</BaseUrl>\n", XmlBaseUrlStr.CStr());
960  }
961  // prepare html parsing
962  PSIn HtmlSIn=TStrIn::New(HtmlStr);
963  THtmlLx HtmlLx(HtmlSIn);
964  TChA ContTextChA; bool InScript=false;
965  // save text
966  fprintf(fXml, "<Body>\n");
967  while (HtmlLx.GetSym()!=hsyEof){
968  TStr Str=HtmlLx.ChA;
969  switch (HtmlLx.Sym){
970  case hsyUndef:
971  case hsyUrl:
972  case hsyMTag:
973  break;
974  case hsyStr:
975  if (InScript){break;}
977  if (OutToksP){
978  fprintf(fXml, " <Str>%s</Str>\n", Str.CStr());}
979  if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str;
980  break;
981  case hsyNum:
982  if (InScript){break;}
984  if (OutToksP){
985  fprintf(fXml, " <Num>%s</Num>\n", Str.CStr());}
986  if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str;
987  break;
988  case hsySSym:
989  if (InScript){break;}
991  if (OutToksP){
992  fprintf(fXml, " <Sym>%s</Sym>\n", Str.CStr());}
993  if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str;
994  break;
995  case hsyBTag:{
996  // save continuous text
997  if (!ContTextChA.Empty()){
998  if (OutTextP){
999  fprintf(fXml, " <Text>%s</Text>\n", ContTextChA.CStr());}
1000  ContTextChA.Clr();
1001  }
1002  // extract tag name
1003  Str=Str.GetSubStr(1, Str.Len()-2);
1005  // process tag
1006  if (!InScript){
1007  // check script tag
1008  if (Str=="SCRIPT"){
1009  InScript=true; break;}
1010  // output tag
1011  if (OutTagsP){
1012  if (OutArgsP){
1013  fprintf(fXml, " <BTag Nm=\"%s\">\n", Str.CStr());
1014  for (int ArgN=0; ArgN<HtmlLx.GetArgs(); ArgN++){
1015  TStr ArgNm=TXmlLx::GetXmlStrFromPlainStr(HtmlLx.GetArgNm(ArgN));
1016  TStr ArgVal=TXmlLx::GetXmlStrFromPlainStr(HtmlLx.GetArgVal(ArgN));
1017  fprintf(fXml, " <Arg Nm=\"%s\" Val=\"%s\"/>", ArgNm.CStr(), ArgVal.CStr());
1018  }
1019  fprintf(fXml, " </BTag>\n");
1020  } else {
1021  fprintf(fXml, " <BTag Nm=\"%s\"/>\n", Str.CStr());
1022  }
1023  }
1024  // check if URL present
1025  PHtmlTok Tok=HtmlLx.GetTok();
1026  TStr RelUrlStr;
1027  if (Tok->IsUrlTok(RelUrlStr)){
1028  PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
1029  if (Url->IsOk()){
1030  OutUrlStrV.Add(Url->GetUrlStr());
1031  if (OutUrlP){
1032  TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr());
1033  fprintf(fXml, " <Url>%s</Url>\n", XmlUrlStr.CStr());
1034  }
1035  }
1036  }
1037  }
1038  break;}
1039  case hsyETag:{
1040  // save continuous text
1041  if (!ContTextChA.Empty()){
1042  if (OutTextP){
1043  fprintf(fXml, " <Text>%s</Text>\n", ContTextChA.CStr());}
1044  ContTextChA.Clr();
1045  }
1046  // extract tag name
1047  Str=Str.GetSubStr(1, Str.Len()-2);
1049  // process tag
1050  if (InScript){
1051  if (Str=="SCRIPT"){
1052  InScript=false; break;}
1053  } else {
1054  if (OutTagsP){
1055  fprintf(fXml, " <ETag Nm=\"%s\"/>\n", Str.CStr());}
1056  }
1057  break;}
1058  case hsyEof: break;
1059  default: Fail;
1060  }
1061  }
1062  // save continuous text
1063  if (!ContTextChA.Empty()){
1064  if (OutTextP){
1065  fprintf(fXml, " <Text>%s</Text>\n", ContTextChA.CStr());}
1066  ContTextChA.Clr();
1067  }
1068  fprintf(fXml, "</Body>\n");
1069  // save outgoing urls
1070  fprintf(fXml, "<OutUrls>\n");
1071  for (int UrlN=0; UrlN<OutUrlStrV.Len(); UrlN++){
1072  TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(OutUrlStrV[UrlN]);
1073  fprintf(fXml, " <Url N=\"%d\">%s</Url>\n", 1+UrlN, XmlUrlStr.CStr());
1074  }
1075  fprintf(fXml, "</OutUrls>\n");
1076 
1077  // close top tag
1078  fprintf(fXml, "</HtmlDoc>\n");
1079 }
int Len() const
Definition: dt.h:490
Definition: html.h:79
bool Empty() const
Definition: dt.h:260
#define Fail
Definition: bd.h:238
Definition: html.h:79
void Clr()
Definition: dt.h:258
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
static PUrl New(const TStr &RelUrlStr, const TStr &BaseUrlStr=TStr())
Definition: url.h:25
TStr GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:811
char * CStr()
Definition: dt.h:255
static PSIn New(const TStr &Str)
Definition: dt.h:711
Definition: html.h:80
Definition: html.h:79
Definition: dt.h:201
Definition: html.h:79
Definition: html.h:80
Definition: html.h:80
Definition: dt.h:412
bool Empty() const
Definition: dt.h:491
Definition: html.h:79
FILE * TFileId
Definition: bd.h:17
Definition: html.h:80
Definition: html.h:82
char * CStr()
Definition: dt.h:479
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
static TStr GetXmlStrFromPlainStr(const TChA &PlainChA)
Definition: xml.cpp:968

Here is the call graph for this function:

Here is the caller graph for this function:

void THtmlDoc::SaveHtmlToXml ( const TStr & HtmlStr,
const TStr & XmlFNm,
const TStr & BaseUrlStr,
const bool &  OutTextP,
const bool &  OutUrlP,
const bool &  OutToksP,
const bool &  OutTagsP,
const bool &  OutArgsP 
)
static

Definition at line 1081 of file html.cpp.

References TFOut::New(), and SaveHtmlToXml().

1084  {
1085  // create output file
1086  PSOut XmlSOut=TFOut::New(XmlFNm);
1087  // save to output file
1088  SaveHtmlToXml(HtmlStr, XmlSOut, BaseUrlStr, OutTextP, OutUrlP,
1089  OutToksP, OutTagsP, OutArgsP);
1090 }
static PSOut New(const TStr &FNm, const bool &Append=false)
Definition: fl.cpp:442
static void SaveHtmlToXml(const TStr &HtmlStr, const PSOut &XmlSOut, const TStr &BaseUrlStr, const bool &OutTextP, const bool &OutUrlP, const bool &OutToksP, const bool &OutTagsP, const bool &OutArgsP)
Definition: html.cpp:946
Definition: bd.h:196

Here is the call graph for this function:

void THtmlDoc::SaveTxt ( const PSOut & SOut,
const bool &  TxtMode = true 
) const

Definition at line 915 of file html.cpp.

References TInt::GetStr(), TVec< TVal, TSizeTy >::Len(), TSOut::PutLn(), TSOut::PutStr(), and TokV.

Referenced by THtmlHldV::THtmlHldV().

915  {
916  if (TxtMode){
917  for (int TokN=0; TokN<TokV.Len(); TokN++){TokV[TokN]->SaveTxt(SOut);}
918  SOut->PutLn();
919  } else {
920  for (int TokN=0; TokN<TokV.Len(); TokN++){
921  SOut->PutStr(TInt::GetStr(TokN)); SOut->PutStr(": ");
922  TokV[TokN]->SaveTxt(SOut);
923  SOut->PutLn();
924  }
925  }
926 }
TStr GetStr() const
Definition: dt.h:1200
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
THtmlTokV TokV
Definition: html.h:256

Here is the call graph for this function:

Here is the caller graph for this function:

Friends And Related Function Documentation

friend class TPt< THtmlDoc >
friend

Definition at line 254 of file html.h.

Member Data Documentation

TCRef THtmlDoc::CRef
private

Definition at line 254 of file html.h.

THtmlTokV THtmlDoc::TokV
private

Definition at line 256 of file html.h.

Referenced by SaveTxt(), and THtmlDoc().


The documentation for this class was generated from the following files: