5 for (
int i = 1; i < CdfV.
Len(); i++) {
6 CdfV[i].Val2 = CdfV[i-1].Val2 + CdfV[i].Val2; }
11 for (
int i = 1; i < CdfV.
Len(); i++) {
12 CdfV[i].Val2 = CdfV[i-1].Val2 + CdfV[i].Val2; }
17 for (
int i = 1; i < CdfV.
Len(); i++) {
18 CdfV[i].Dat = CdfV[i-1].Dat + CdfV[i].Dat; }
35 for (
int i = CCdfV.
Len()-2; i >= 0; i--) {
36 CCdfV[i].Val2 = CCdfV[i+1].Val2 + CCdfV[i].Val2; }
41 for (
int i = CCdfV.
Len()-2; i >= 0; i--) {
42 CCdfV[i].Val2 = CCdfV[i+1].Val2 + CCdfV[i].Val2; }
47 for (
int i = CCdfV.
Len()-2; i >= 0; i--) {
48 CCdfV[i].Dat = CCdfV[i+1].Dat + CCdfV[i].Dat; }
65 for (
int i = PdfV.
Len()-1; i > 0; i--) {
66 PdfV[i].Val2 = PdfV[i].Val2 - PdfV[i-1].Val2; }
71 for (
int i = PdfV.
Len()-1; i > 0; i--) {
72 PdfV[i].Val2 = PdfV[i].Val2 - PdfV[i-1].Val2; }
77 for (
int i = PdfV.
Len()-1; i > 0; i--) {
78 PdfV[i].Dat = PdfV[i].Dat - PdfV[i-1].Dat; }
83 for (
int i = 0; i < PdfV.
Len(); i++) {
84 Sum += PdfV[i].Val2; }
85 if (Sum <= 0.0) {
return; }
86 for (
int i = 0; i < PdfV.
Len(); i++) {
87 PdfV[i].Val2 /= Sum; }
92 for (
int i = 0; i < PdfV.
Len(); i++) {
94 if (Sum <= 0.0) {
return; }
95 for (
int i = 0; i < PdfV.
Len(); i++) {
110 for (
int i = 0; i < YValV.
Len(); ) {
111 ExpYValV.
Add(YValV[i]);
112 i = int(i*BinFactor);
113 if (i==prevI) { i++; }
121 for (
int i = 0; i < YValV.
Len(); ) {
122 ExpYValV.
Add(YValV[i]);
123 i = int(i*BinFactor);
124 if (i==prevI) { i++; }
188 int EndSlash = UrlChA.
SearchCh(
'/', 7)-1;
190 const int BegSlash = UrlChA.
SearchChBack(
'/', EndSlash);
191 if (BegSlash > 0) {
return UrlChA.
GetSubStr(BegSlash+1, EndSlash).
ToLc(); }
196 if (EndSlash > 0) {
return UrlChA.
GetSubStr(0, EndSlash-1).
ToLc(); }
197 else {
return TChA(UrlChA).
ToLc(); }
208 const char *c = Url.
CStr();
210 while (*c && cnt != Count) {
211 if (*c == Ch) { cnt++; }
214 return int(c-Url.
CStr()-1);
221 if (DomNm ==
"blog.myspace.com") {
248 if (DomNm==
"blogs.msdn.com" || DomNm==
"ameblo.jp" || DomNm==
"xfruits.com" || DomNm==
"scienceblogs.com" || DomNm==
"blogs.sun.com"
249 || DomNm==
"blog.wired.com" || DomNm==
"weblogs.asp.net" || DomNm==
"blogs.technet.com" || DomNm==
"blogs.guardian.co"
250 || DomNm==
"blogs.clarin.com" || DomNm==
"blogs.sun.com" || DomNm==
"blog.wired.com" || DomNm==
"weblogs.asp.net"
251 || DomNm==
"blogs.technet.com" || DomNm==
"blogs.guardian.com" || DomNm==
"blogs.clarin.com" || DomNm==
"blogs.zdnet.com"
252 || DomNm==
"blogs.citypages.com" || DomNm==
"voices.washingtonpost.com" || DomNm==
"blog.tv2.dk"
253 || DomNm==
"blogs.menomoneefallsnow.com" || DomNm==
"weblogs.baltimoresun.com" || DomNm==
"eonline.com") {
258 if (DomNm ==
"digg.com") {
259 if (PostUrlStr.
IsPrefix(
"http://digg.com/submit?")) {
260 const int Url = PostUrlStr.
SearchStr(
";url=");
271 if (PostUrlStr.
IsPrefix(
"http://nydailynews.com/blogs/") || PostUrlStr.
IsPrefix(
"http://bbc.co.uk/blogs/")
272 || PostUrlStr.
IsPrefix(
"http://nydailynews.com/blogs/") || PostUrlStr.
IsPrefix(
"http://newsbusters.org/blogs/")) {
276 if (DomNm==
"feeds.feedburner.com") {
280 if (DomNm==
"groups.google.com") {
284 if (DomNm==
"news.google.com") {
285 const int UrlPos = PostUrlStr.
SearchStr(
"&url=");
290 if (DomNm ==
"bloggrevyen.no") {
291 const int Http2 = PostUrlStr.
SearchStr(
"/http://");
297 if (DomNm.
IsSuffix(
".rd.yahoo.com")) {
298 const int Http2 = PostUrlStr.
SearchStr(
"/*");
308 if (
StripEnd(UrlIn,
"/", UrlOut)) {}
309 else if (
StripEnd(UrlIn,
"/index.html", UrlOut)) {}
310 else if (
StripEnd(UrlIn,
"/index.htm", UrlOut)) {}
311 else if (
StripEnd(UrlIn,
"/index.php", UrlOut)) {}
319 if (UrlIn[0] !=
'/') { Out.
AddCh(
'/'); }
324 if (UrlOut.
IsPrefix(
"http://www.")) {
332 const int StrLen = Str.
Len();
333 const int SearchStrLen = SearchStr.
Len();
334 if (StrLen < SearchStrLen) {
return false; }
335 for (
int i = 0; i < SearchStrLen; i++) {
336 if (Str[StrLen-i-1] != SearchStr[SearchStrLen-i-1]) {
return false; }
338 NewStr = Str.
GetSubStr(0, StrLen-SearchStrLen-1);
343 if (LongStr.
Len() < MaxLen) {
return LongStr; }
351 char *b = (
char *) ChA.
CStr();
353 if (*b == 0) {
return TChA(); }
361 OutChA += b; OutChA.
AddCh(
' ');
367 OutChA.DelLastCh(); OutChA.ToLc();
373 char *b = (
char *) ChA.
CStr();
375 if (*b == 0) {
return TChA(); }
380 while (*e &&
TCh::IsWs(*e)) { e++; ws=
true; }
382 if (ws) { OutChA.
AddCh(
' '); ws=
false; }
395 for (
const char *c = CStr; *c; c++) {
406 for (
int w = 0; w < WrdV.
Len(); w++) {
407 if (StopWordH.
IsKey(WrdV[w])) { SWordCnt++; }
409 return WrdV.
Len() - SWordCnt;
415 for (
char *c = (
char *) ChA.
CStr(); *c; c++) {
416 if ((SplitOnWs && *c ==
' ') || (! SplitOnWs && !
TCh::IsAlNum(*c))) {
428 for (
char *c = (
char *) ChA.
CStr(); *c; c++) {
431 if (SkipEmpty && ! WrdV.
Empty() && strlen(WrdV.
Last()) == 0) { WrdV.
DelLast(); }
435 if (SkipEmpty && ! WrdV.
Empty() && strlen(WrdV.
Last()) == 0) { WrdV.
DelLast(); }
443 for (
char *c = (
char *) ChA.
CStr(); *c; c++) {
445 if (c > ChA.
CStr() && *(c-1)==
'\r') { *(c-1)=0; }
448 if (IsChs) { LineV.
Add(c+1); }
462 const char *B = ChA.
CStr();
463 const char *E = B+ChA.
Len();
464 char *c = (
char *) B;
466 if (*c) { SentenceV.
Add(c); }
else {
return 0; }
468 if (c<E && (*c ==
'.' || *c ==
'!' || *c ==
'?') && !
TCh::IsAlNum(*(c+1))) {
469 if (c<E && *(c+1)==
'"') { *c=
'"'; c++; }
470 if (c>=E) {
continue; }
473 while (e>B && *e!=
'"' && !
TCh::IsAlNum(*e)) { *e=0; e--; }
475 if (c<E) { SentenceV.
Add(c); }
478 return SentenceV.
Len();
499 StrB = (
char *) HtmlStr.
CStr();
500 StrE = (
char *) StrB+HtmlStr.
Len();
501 for (
char *e = StrB; e < StrE; ) {
503 while (e<StrE && *e !=
'<') { e++; }
506 TextStr+= b; TextStr.
AddCh(
' '); *e = tmp;
507 if (e >= StrE) {
return; }
509 if (e[1]==
'!' && e[2]==
'-' && e[3]==
'-') {
511 while(e<StrE && !(*(e-2)==
'-' && *(e-1)==
'-' && *e==
'>')) { e++; }
515 if (e[1]==
's' && e[2]==
'c' && e[3]==
'r' && e[4]==
'i' && e[5]==
'p' && e[6]==
't') {
517 while(e<StrE && !(*(e-6)==
's' && *(e-5)==
'c' && *(e-4)==
'r' && *(e-3)==
'i' && *(e-2)==
'p' && *(e-1)==
't' && *e==
'>')) { e++; }
521 while (e < StrE && *e != '>
') { e++; }
522 if (e>=StrE) { return; }
527 bool TStrUtil::IsLatinStr(const TChA& Str, const double& MinAlFrac) {
528 int AlNumCnt=0, ChCnt=0;
529 for (const char *c = Str.CStr(); *c; c++) {
530 if (TCh::IsWs(*c)) { continue; }
531 if (*c > 0 && TCh::IsAlNum(*c)) { AlNumCnt++; }
534 if (double(AlNumCnt)/double(ChCnt) > MinAlFrac) { return true; }
538 void TStrUtil::GetWIdV(const TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) {
539 const int NotWId = -1;
543 TStrUtil::SplitWords(ChA, WrdV);
545 for (int w = 0; w < WrdV.Len(); w++) {
546 if (StrH.IsKeyGetDat(WrdV[w], WId)) { WIdV.Add(WId); }
547 else { WIdV.Add(NotWId); }
551 // add words to StrH and get a vector of word ids
552 void TStrUtil::GetAddWIdV(TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) {
556 TStrUtil::SplitWords(ChA, WrdV);
558 for (int w = 0; w < WrdV.Len(); w++) {
559 WIdV.Add(StrH.AddDatId(WrdV[w]));
563 // Parse time in various formats:
564 // 10:16, 16 Sep 2004
565 // 10:20, 2004 Sep 16
566 // 2005-07-07 20:30:35
567 // 23:24:07, 2005-07-10
569 // 21:16, July 9, 2005
570 // 06:02, 10 July 2005
571 bool TStrUtil::GetTmFromStr(const char* TmStr, TSecTm& Tm) {
572 static TStrV MonthV1, MonthV2;
573 if (MonthV1.Empty()) {
574 TStr("january|february|march|april|may|june|july|august|september|october|november|december").SplitOnAllCh('|
', MonthV1);
575 TStr("jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec").SplitOnAllCh('|
', MonthV2);
580 const char* End = Tmp.CStr()+Tmp.Len();
581 int Col = -1, Cols=0;
582 for (char *b = Tmp.CStr(); b <End; ) {
584 while (*b && ! (*b==' ' || *b=='-
' || *b==':
' || *b==',
')) { b++; }
585 if (*b==':
') { if(Col==-1) { Col=WrdV.Len(); } Cols++; }
587 while (*b && (*b==' ' || *b=='-
' || *b==':
' || *b==',
')) { b++; }
590 if (Col+1 >= WrdV.Len()) { return false; }
593 if (Col<1) { return false; }
594 const int Hr = atoi(WrdV[Col-1]);
595 const int Min = atoi(WrdV[Col]);
596 WrdV.Del(Col); WrdV.Del(Col-1);
597 if (WrdV.Len() != 3) { return false; }
598 int y=0,m=1,d=2, Mon=-1;
599 if (TCh::IsAlpha(WrdV[0][0])) {
601 } else if (TCh::IsAlpha(WrdV[1][0])) {
603 } else if (TCh::IsAlpha(WrdV[2][0])) {
609 int Day = atoi(WrdV[d]);
610 if (Mon <= 0) { Mon = MonthV1.SearchForw(WrdV[m])+1; }
611 if (Mon <= 0) { Mon = MonthV2.SearchForw(WrdV[m])+1; }
612 if (Mon == 0) { return false; }
613 int Year = atoi(WrdV[y]);
614 if (Day > Year) { ::Swap(Day, Year); }
615 //printf("%d-%02d-%02d %02d:%02d\n", Year, Mon, Day, Hr, Min);
616 Tm = TSecTm(Year, Mon, Day, Hr, Min, 0);
620 // Standardize first and last names into <last_name>_<first name initial>
621 TStr TStrUtil::GetStdName(TStr AuthorName) {
624 AuthorName.ChangeChAll('\n
', ' ');
625 AuthorName.ChangeChAll('.
', ' ');
626 // if there is a number in the name, remove it and everything after it
628 while (pos<AuthorName.Len() && (AuthorName[pos]!='#
' && !TCh::IsNum(AuthorName[pos]))) {
630 if (pos < AuthorName.Len()) {
631 AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc(); }
632 if (AuthorName.Empty()) { return TStr::GetNullStr(); }
634 // remove '(' and everything after it
635 int b = AuthorName.SearchCh('(
');
637 AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc(); }
638 // skip if contains ')'
639 if (AuthorName .SearchCh(')
')!=-1) { return TStr::GetNullStr(); }
640 // skip if it is not a name
641 if (AuthorName .SearchStr("figures")!=-1 || AuthorName .SearchStr("macros")!=-1
642 || AuthorName .SearchStr("univ")!=-1 || AuthorName .SearchStr("institute")!=-1) {
643 return TStr::GetNullStr();
645 // remove all non-letters (latex tags, ...)
647 for (i = 0; i < AuthorName.Len(); i++) {
648 const char Ch = AuthorName[i];
649 if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-
') { NewName += Ch; }
651 StdName = NewName; StdName.ToTrunc();
652 TStrV AuthNmV; StdName.SplitOnWs(AuthNmV);
653 // too short -- not a name
654 if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast();
655 if (AuthNmV.Len() < 2) return TStr::GetNullStr();
657 const TStr LastNm = AuthNmV.Last();
658 if (! TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr();
660 IAssert(isalpha(AuthNmV[0][0]));
661 return TStr::Fmt("%s_%c", LastNm.CStr(), AuthNmV[0][0]);
664 void TStrUtil::GetStdNameV(TStr AuthorNames, TStrV& StdNameV) {
665 AuthorNames.ChangeChAll('\n
', ' ');
667 // split into author names
668 TStrV AuthV, TmpV, Tmp2V;
670 AuthorNames.SplitOnStr(" and ", TmpV);
672 for (i = 0; i < TmpV.Len(); i++) {
673 TmpV[i].SplitOnAllCh(',
', Tmp2V); AuthV.AddV(Tmp2V); }
675 TmpV = AuthV; AuthV.Clr();
676 for (i = 0; i < TmpV.Len(); i++) {
677 TmpV[i].SplitOnAllCh('&
', Tmp2V); AuthV.AddV(Tmp2V); }
679 TmpV = AuthV; AuthV.Clr();
680 for (i = 0; i < TmpV.Len(); i++) {
681 TmpV[i].SplitOnAllCh(',
', Tmp2V); AuthV.AddV(Tmp2V); }
683 TmpV = AuthV; AuthV.Clr();
684 for (i = 0; i < TmpV.Len(); i++) {
685 TmpV[i].SplitOnAllCh(';
', Tmp2V); AuthV.AddV(Tmp2V); }
688 //printf("\n*** %s\n", AuthorNames.CStr());
689 for (i = 0; i < AuthV.Len(); i++) {
690 TStr StdName = GetStdName(AuthV[i]);
691 if (! StdName.Empty()) {
692 //printf("\t%s ==> %s\n", AuthV[i].CStr(), StdName.CStr());
693 StdNameV.Add(StdName);
701 double TStopwatch::Tick() {
703 //return clock() / ((double)CLOCKS_PER_SEC);
707 return omp_get_wtime();
713 return GetTickCount() / 1000.0;
717 struct rusage rusage;
719 getrusage(RUSAGE_SELF, &rusage);
725 ((float) (rusage.ru_utime.tv_usec + rusage.ru_stime.tv_usec) / 1000000) +
727 ((float) (rusage.ru_utime.tv_sec + rusage.ru_stime.tv_sec));
733 void TStopwatch::Start(const TExperiment Exp) {
734 Starts[Exp] = Tick();
737 void TStopwatch::Stop(const TExperiment Exp) {
738 double Duration = Tick() - Starts[Exp];
739 Sums[Exp] += Duration;
740 Maxs[Exp] = Maxs[Exp] >= Duration ? Maxs[Exp] : Duration;
741 Mins[Exp] = Mins[Exp] <= Duration ? Mins[Exp] : Duration;
745 int TStopwatch::Cnt(const TExperiment Exp) const {
749 double TStopwatch::Sum(const TExperiment Exp) const {
753 double TStopwatch::Avg(const TExperiment Exp) const {
754 return Sums[Exp] / Cnts[Exp];
757 double TStopwatch::Max(const TExperiment Exp) const {
761 double TStopwatch::Min(const TExperiment Exp) const {
768 #if defined(SW_WRITEN)
769 int WriteN(int fd, char *ptr, int nbytes) {
775 nwritten = (int) write(fd, ptr, nleft);
782 return (nbytes-nleft);
static TChA GetDomNm(const TChA &UrlChA)
static bool GetNormalizedUrl(const TChA &UrlIn, const TChA &BaseUrl, TChA &UrlOut)
Quick URL normalization: Remove ending /, /index.html, etc. and strip starting www.
static void MakeExpBins(const TFltPrV &XYValV, TFltPrV &ExpXYValV, const double &BinFactor=2, const double &MinYVal=1)
static TChA GetWebsiteNm(const TChA &UrlChA)
static TChA GetDomNm2(const TChA &UrlChA)
void AddCh(const char &Ch, const int &MxLen=-1)
TSizeTy Len() const
Returns the number of elements in the vector.
static int SplitSentences(TChA &ChA, TVec< char * > &SentenceV)
static void GetXmlTagNmVal(TXmlLx &XmlLx, TChA &TagNm, TChA &TagVal)
static void MakeExpBins(const TFltPrV &XYValV, TFltPrV &ExpXYValV, const double &BinFactor=2, const double &MinYVal=1)
int SearchStr(const TChA &Str, const int &BChN=0) const
static void RemoveHtmlTags(const TChA &HtmlStr, TChA &TextStr)
static bool GetXmlTagNmVal2(TXmlLx &XmlLx, TChA &TagNm, TChA &TagVal, const bool &TakeTagNms)
bool Empty() const
Tests whether the vector is empty.
static void GetPdf(const TIntPrV &CdfV, TIntPrV &PdfV)
static TChA GetShorStr(const TChA &LongStr, const int MaxLen=50)
int SearchChBack(const char &Ch, int BChN=-1) const
static bool IsWs(const char &Ch)
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
static int SplitLines(TChA &ChA, TVec< char * > &LineV, const bool &SkipEmpty=false)
bool IsKey(const char *Key) const
bool IsPrefix(const char *CStr, const int &BChN=0) const
static int CountWords(const char *CStr)
static int SplitOnCh(TChA &ChA, TVec< char * > &WrdV, const char &Ch, const bool &SkipEmpty=false)
static int SplitWords(TChA &ChA, TVec< char * > &WrdV, const bool &SplitOnWs=true)
static TChA GetCleanWrdStr(const TChA &ChA)
TChA GetSubStr(const int &BChN, const int &EChN) const
const TVal & Last() const
Returns a reference to the last element of the vector.
static TChA & GetXmlTagVal(TXmlLx &XmlLx, const TChA &TagNm)
static void GetCdf(const TIntPrV &PdfV, TIntPrV &CdfV)
static void GetCCdf(const TIntPrV &PdfV, TIntPrV &CCdfV)
static bool IsAlNum(const char &Ch)
int SearchCh(const char &Ch, const int &BChN=0) const
bool IsSuffix(const char *CStr) const
static void Normalize(TFltPrV &PdfV)
#define EAssertR(Cond, MsgStr)
int GetNthOccurence(const TChA &Url, const int &Count, const char Ch='/')
static TChA GetCleanStr(const TChA &ChA)
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
void DelLast()
Removes the last element of the vector.
static bool StripEnd(const TChA &Str, const TChA &SearchStr, TChA &NewStr)