-
Notifications
You must be signed in to change notification settings - Fork 444
/
Copy pathAddress.cpp
20894 lines (19310 loc) · 609 KB
/
Address.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//-*- coding: utf-8 -*-
#include "Proxy.h"
class Address *g_address; // for debug
#define CRID_ANY 0
#define CRID_US 226
//
// if you have "in <city/adm1 name>" in same sentence as street then
// require that that item be a city/adm1 in any address you try to do.
// i would set "int64_t inPrepPhrase" to be the city/adm1 place hash.
// so if it is not zero, check for it. but add it with addProperPlaces()
// first to see if it added anything!! then we can
//
//and fix it so "1914" years and older years are pub dates!
//and include days of the week in pub dates like "sunday, april 11, 2004"
//too!!
//do not allow lower case 'or' in place name!
//do not allow place names starting with "arrangements by" or "sponsored by"
// test on http://alibi.com/index.php?scn=cal
// test on http://www.burtstikilounge.com/burts/
// TODO: FOR ADDRESS overlap detection, just hash every word index for
// every Place which can not be shared. then store the score and
// Address ptr as the data value, so we can do a quick compare!
// TODO: also add conflicting addresses with the same score as winners.
// if we can't resolve a winner then we should just eliminate both/all
// to be on the safe side. like the alibi.com page has both albuquerque
// and santa fe in the <title> tag so it is really just lucky that we
// pick albuquerque most of the time... we might be able to bring in
// street name to city map to help us fix this one. if both cities have
// the same street name, then nuke both! any other ideas?
// TODO: for the abqjournal.com page we need to determine the most popular
// city/adm1 pair over the whole page and use that as another default
// option. also consider if we should have several and score them...
// TODO: for all the phrases in "small" sections and all phrases following
// "at" or "at the" look those phrases up in placedb as place names
// to get their addresses. also confirm the place names we extract
// that are immediately before street names. also get all the possible
// city/adm1/ctry tuples that each place name might have. if these
// are not right next to it then i guess you need to get them from
// the title and tagdb. that way the placedb lookup can integrate
// the tuples into the key and greatly narrow the list. we may have
// to then do multiple lookups for the same place name in placedb,
// so another reason we should distribute them and keep them in memory
// or at least on an SSD. use *namedb* to index place names just like
// indexdb. then we can conduct a search for a place name on namedb
// and get the corresponding keys of the place records in placedb.
// namedb will need to be mostly in memory then!
// TODO: verify street addresses we do extract by looking up each one in
// placedb by the street. each street may have multiple city/adm1/ctry
// tuples, so this lookup should narrow it down!
// test zipcode hyphen fix on abqjournal.com/contact.html
#include "gb-include.h"
#include "Address.h"
#include "Sections.h"
//#include "DateParse2.h"
#include "Abbreviations.h"
#include "Phrases.h"
//#include "Weights.h"
#include "XmlDoc.h" // hashWords()
#include "Hostdb.h"
#include "Placedb.h"
#include "sort.h"
#include "HttpServer.h"
//#define CF_UNIQUE (((uint64_t)1LL)<<63)
bool getBestLatLon ( RdbList *list ,
double *bestLat ,
double *bestLon ,
int32_t *numVotes ,
int32_t niceness ,
int32_t winnerSnh ) ;
char *getLatLonPtrFromStr ( char *data ) ;
void getLatLonFromStr ( char *data , double *lat , double *lon);
char *getStateAbbr ( uint64_t bit ) ;
int64_t getWordXorHash ( char *s ) ;
int64_t getWordXorHash2 ( char *s ) ;
int32_t getStateOffset ( int64_t *h ) ;
class StateDesc *getStateDescFromBits ( uint64_t bit ) ;
// returns 0 if not a state:
uint64_t getStateBitFromHash ( int64_t *h ) ;
static bool setHashes ( class Place *p , Words *ww , int32_t niceness ) ;
static bool addIndicator ( char *s , char bit , float boost );
static bool addIndicator ( int64_t h , char bit , float boost );
//static void printAddress ( class Address *A , class SafeBuf *pbuf , int32_t i);
static void printPlaces ( PlaceMem *pm , SafeBuf *pbuf ,
class Sections *sections ,
class Address *base ) ;
static bool getZipLatLon ( char *zip ,
int32_t zipLen ,
float *zipLat ,
float *zipLon ) ;
//
// new stuff
//
static bool generatePlacesFile ( ) ;
static bool loadPlaces ( ) ;
class PlaceDesc *getState_new ( uint64_t pd64 , uint8_t crid , int32_t niceness );
PlaceDesc *getState2_new ( char *state , uint8_t crid , int32_t niceness ) ;
class PlaceDesc *getCity_new ( uint64_t ch64 ,
char *stateAbbr ,
uint8_t crid ,
int32_t niceness ) ;
class PlaceDesc *getCity2_new ( char *city ,
char *stateAbbr ,
uint8_t crid ,
int32_t niceness ) ;
PlaceDesc *getCity3_new ( uint64_t ch64 ,
uint64_t stateHash64,
uint8_t crid ,
int32_t niceness ) ;
bool getLongestPlaceName_new ( int32_t i,
int32_t alnumPos,
Words *w,
// must match! PDF_CITY|STATE|COUNTRY
uint8_t placeType,
uint8_t crid, // can be CRID_ANY
char *stateAbbr, // can be NULL
uint64_t *placeHash64,
int32_t *placeAlnumA,
int32_t *placeAlnumB,
int32_t *placeA,
int32_t *placeB ,
// set to most popular match
PlaceDesc **pdp ) ;
bool getZip_new ( int32_t a ,
int32_t alnumPos ,
Words *words ,
uint64_t *zipHash64 ,
uint64_t *zipCityHash64 ,
uint64_t *zipStateHash64 ,
int32_t *zipAlnumA,
int32_t *zipAlnumB,
int32_t *zipA,
int32_t *zipB ,
float *zipLat,
float *zipLon) ;
PlaceDesc *getMostPopularPlace_new ( int64_t cityHash64,
uint8_t crid ,
uint8_t placeType,
int32_t niceness );
char *g_pbuf = NULL;
int32_t g_pbufSize = 0;
HashTableX g_nameTable;
// Returns a pointer to this place's official name, which is stored in the
// global g_pbuf buffer at byte offset m_officialNameOffset.
char *PlaceDesc::getOfficialName ( ) {
	return &g_pbuf[m_officialNameOffset];
}
char *PlaceDesc::getStateName ( ) {
// get our state abbr
char buf[3];
buf[0] = m_adm1[0];
buf[1] = m_adm1[1];
buf[2] = '\0';
// does this convert to lowercase? yes... it should
uint64_t placeHash64 = getWordXorHash ( buf );
// look up the place desc for the state
PlaceDesc *sd = getPlaceDesc ( placeHash64 ,
PDF_STATE,
m_crid,
buf, // state abbr
0 ); // niceness
if ( ! sd ) return NULL;
return sd->getOfficialName();
}
const char *PlaceDesc::getCountryName ( ) {
return g_countryCode.getName ( m_crid );
}
HashTableX g_indicators;
static HashTableX g_timeZones;
static HashTableX g_cities;
static HashTableX g_states;
static HashTableX g_aliases;
static HashTableX g_zips;
char *g_cityBuf = NULL;
int32_t g_cityBufSize = 0;
// . descriptor for one city name
// . each slot in g_cities holds a ptr to a CityDesc stored in g_cityBuf
// . that lets all the alternate names and aliases of a city live in the
//   same table
// . variable-length record: m_data[] is a trailing flexible array, so a
//   CityDesc is only as big as its fixed header plus its data string
class CityDesc {
public:
// one bit set for each state that has a city by this name
// NOTE(review): bit position presumably matches the s_states[] index --
// confirm against getStateBitFromHash()
uint64_t m_adm1Bits;
// index into s_states[] of the most popular state for this city name.
// for chicago, we would pick "13" since s_states[13] is illinois
char m_mostPopularState;
// location list and per-language names, in a format like:
// "us.nm,us.ny,es.a1,...|en-nl-fi=cincinnati,es-de=cincinnatus,..."
char m_data[];
};
//bool setFromStr(Address *a,char *s,pbits_t flags ,
// Place *places , int32_t *np , int32_t maxPlaces, int32_t niceness );
static uint64_t getAddressHash ( Place *street ,
Place *city ,
Place *adm1 ,
Place *zip ) ;
static void verifiedWrapper ( void *state ) ;
static void gotMsg2cReplyWrapper ( void *state , void *state2 ) ;
static void gotList2c ( void *state , RdbList *xxx , Msg5 *yyy ) ;
static void sendBackAddress ( class State2c *st ) ;
Place *g_pa = NULL;
#define MIN_POP_COUNT 500
//#define MAX_STREETS 300
//#define MAX_PLACES 3500
// i raised from 15 to 25 since "Virginia Beach" city was not being picked up
// on socialmediabeach.com
#define MAX_CITIES 25
#define MAX_ADM1 80 // 1500
#define MAX_ZIPS 5
// . descriptor for one zip code
// . g_zips is stocked with these
class ZipDesc {
public:
// . two-char adm1 (state/province) code
// . this is unique within the country code only
// . see /gb/geo/geonames/admin1Codes.txt for the list
// . remove the "CC." country code prefixing each
// . example from that file: "NL.09 Utrecht\n"
char m_adm1[2];
// (disabled) a single byte country id, converted from a 2 char country id
//uint8_t m_crid;
// hash of the city it is in
int64_t m_cityHash;
// offset into g_cityBuf of the city name
int32_t m_cityOffset;
// now we use the adm1 bits since US-only now
uint64_t m_adm1Bits;
// lat/lon of centroid. for sorting by dist when user's zip is known
float m_latitude;
float m_longitude;
// (disabled) older reset() from when m_crid was still a member
//void reset() {m_crid = 0; m_adm1[0] = m_adm1[1] = 0;};
// clears only the adm1 info (m_adm1Bits and m_adm1); the city hash,
// city offset and lat/lon are left untouched
void reset() {m_adm1Bits = 0;m_adm1[0]=0; m_adm1[1]=0;};
};
// . NULL-terminated list of day-of-week words, singular then plural
// . ORDER MATTERS: getDayOfWeek() loads this list into a word table and
//   maps a word's score to a day with (score-1) % 7, so each block of
//   seven must run sunday..saturday for "sunday" and "sundays" to map
//   to the same day
// . NOTE(review): presumably initWordTable() scores each word by its
//   1-based position in this list -- confirm
static char *s_days[] = {
"sunday",
"monday",
"tuesday",
"wednesday",
"thursday",
"friday",
"saturday",
"sundays",
"mondays",
"tuesdays",
"wednesdays",
"thursdays",
"fridays",
"saturdays",
NULL
};
// . table of the 50 US states plus the district of columbia
// . each entry is: two-letter postal abbreviation, full lowercase name,
//   and an alternate abbreviation (NULL if the state has none listed)
// . ORDER MATTERS: other code refers to entries by index, e.g.
//   CityDesc::m_mostPopularState expects s_states[13] to be illinois
// . NOTE(review): the per-state bits in the m_adm1Bits fields presumably
//   use these same indexes as bit positions -- confirm before reordering
//   or inserting entries
static StateDesc s_states[] = {
{"al","alabama","ala"},
{"ak","alaska","alas"},
{"az","arizona","ariz"},
{"ar","arkansas","ark"},
{"ca","california","calif"},
{"co","colorado","colo"},
{"ct","connecticut","conn"},
{"de","delaware","del"},
{"dc","district of columbia","d.c."},
{"fl","florida","fla"},
{"ga","georgia",NULL},
{"hi","hawaii","h.i."},
{"id","idaho","ida"},
{"il","illinois","ill"},
{"in","indiana","ind"},
{"ia","iowa",NULL},
{"ks","kansas","kan"},
{"ky","kentucky",NULL},
{"la","louisiana",NULL},
{"me","maine",NULL},
{"md","maryland",NULL},
{"ma","massachusetts","mass"},
{"mi","michigan","mich"},
{"mn","minnesota","minn"},
{"ms","mississippi","miss"},
{"mo","missouri",NULL},
{"mt","montana","mont"},
{"ne","nebraska","nebr"},
{"nv","nevada","nev"},
{"nh","new hampshire","n.h."},
{"nj","new jersey","n.j."},
{"nm","new mexico","n.m."},
{"ny","new york","n.y."},
{"nc","north carolina","n.c."},
{"nd","north dakota","n.d."},
{"oh","ohio",NULL},
{"ok","oklahoma","okla"},
{"or","oregon","ore"},
{"pa","pennsylvania","penn"},
{"ri","rhode island","r.i."},
{"sc","south carolina","s.c."},
{"sd","south dakota","s.d."},
{"tn","tennessee","tenn"},
{"tx","texas","tex"},
{"ut","utah",NULL},
{"vt","vermont",NULL},
{"va","virginia","virg"},
{"wa","washington","wash"},
{"wv","west virginia","w.v."},
{"wi","wisconsin","wis"},
{"wy","wyoming","wyo"}
};
#include "StopWords.h"
// word table built from s_days[], and whether it has been built yet
static HashTableX s_doyTable;
static bool s_doyInit = false;

// . maps the hash of a day-of-week word ("sunday", "mondays", ...) to a
//   day number, where 0 is sunday and 6 is saturday
// . singular and plural forms map to the same day
// . returns -1 if the word table could not be initialized
// . NOTE(review): a word not in the table presumably scores 0, which
//   comes out as (0-1) % 7 == -1 as well -- confirm getScore()'s
//   not-found value
int32_t getDayOfWeek ( int64_t h ) {
	// build the table from s_days[] the first time we are called
	if ( ! s_doyInit )
		s_doyInit = initWordTable ( &s_doyTable , s_days , "doytbl" );
	// bail if that failed; we will try again on the next call
	if ( ! s_doyInit ) return -1;
	// look up the word's score. "sunday" should score 1.
	int32_t score = s_doyTable.getScore ( &h );
	// fold the singular and plural scores down onto 0-6
	return (score - 1) % 7;
}
// http://www.dailylobo.com/calendar/
// http://www.abqthemag.com/events.html
// http://www.abqjournal.com/calendar/default.php
// http://www.abqjournal.com/calendar/month.htm (243k! do not truncate!!)
// http://www.kasa.com/subindex/entertainment/events_calendar
// http://www.trumba.com/calendars/KRQE_Calendar.rss (rss)
// http://www.koat.com/calendar/index.html
// http://www.trumba.com/calendars/albuquerque-area-events-calendar.rss.
// http://www.google.com/calendar/embed?mode=AGENDA&height=700&wkst=1&bgcolor=%23FFFFFF&src=vn90mq4n30kodohqjv8cdn5cfg%40group.calendar.google.com&color=%237A367A
// http://www.krqe.com/subindex/features/events_calendar
// http://www.alibi.com/index.php?scn=cal
// http://www.publicbroadcasting.net/kunm/events.eventsmain
// http://www.publicbroadcasting.net/kunm/events.eventsmain?action=showCategoryListing&newSearch=true&categorySearch=4025
// http://www.770kob.com/article.asp?id=521586
// http://events.kgoradio.com/
// http://www.livenation.com/venue/journal-pavilion-tickets (journal pavilion)
// http://www.livenation.com/venue/kiva-auditorium-tickets
// http://events.kqed.org/events/
// http://www.sfbg.com/entry.php?entry_id=8401&catid=85&l=1
// http://events.sfgate.com/ (zvents.com)
// http://events.sfgate.com/search?cat=1
// http://entertainment.signonsandiego.com/search/?type=event
// http://www.sdcitybeat.com/cms/event/search/?menu=Events
// ** http://www.sandiegometro.com/calendar/arts.php
// address parsing test cases:
// http://yellowpages.superpages.com/listings.jsp?CS=L&MCBP=true&search=Find+It&SRC=&C=bicycles&STYPE=S&L=Albuquerque+NM+&x=0&y=0
// address examples:
// BRAZIL:
// Marina Costa e Silva
// Rua Afonso Canargo, 805
// Santana
// 85070-200 Guarapuava - PR
// University of New Mexico
// Department of Physics and Astronomy
// MSC07 4220
// 800 Yale Blvd NE
// Albuquerque, New Mexico 87131-0001 USA
// US-380
// Lincoln, NM
// Saturday, August 8, 2009
static bool s_init = false;
// . constructs an empty Addresses object that owns no buffers
// . initializes every member that reset() reads or writes, so the object
//   is in the same state as after a reset() even if the caller inspects
//   it, or it is destroyed, before set() has ever been called
Addresses::Addresses ( ) {
	// no address data buffer yet
	m_buf                = NULL;
	m_bufSize            = 0;
	m_calledGeocoder     = false;
	m_xd                 = NULL;
	m_msg2c              = NULL;
	// no sorted buffer yet. reset() hands m_sortedSize to mfree(), so
	// give it a defined value along with the pointer.
	m_sorted             = NULL;
	m_sortedSize         = 0;
	m_sortedValid        = false;
	// same starting values that reset() uses
	m_firstBreach        = true;
	m_breached           = false;
	m_numValid           = 0;
	m_uniqueStreetHashes = 0;
}
// Destructor: reset() releases everything we own.
Addresses::~Addresses ( ) {
	this->reset();
}
// . frees every buffer we own and puts the per-document state back to
//   its starting values
// . called by the destructor and at the top of set(), so it must be safe
//   to call when nothing has been allocated
void Addresses::reset ( ) {
	// release the address data buffer, if any
	if ( m_buf && m_bufSize ) {
		mfree ( m_buf , m_bufSize , "adata" );
	}
	m_buf     = NULL;
	m_bufSize = 0;
	// empty our SafeBuf
	m_sb.purge();
	// per-document flags and counters
	m_firstBreach    = true;
	m_breached       = false;
	m_numValid       = 0;
	m_calledGeocoder = false;
	// destroy the msg2c if we made one. mdelete() must come before the
	// delete, as in the original.
	// NOTE(review): mdelete() presumably just updates the memory
	// accounting for the "aamsg2c" label -- confirm
	if ( m_msg2c ) {
		mdelete ( m_msg2c , sizeof(Msg2c) , "aamsg2c" );
		delete m_msg2c;
		m_msg2c = NULL;
	}
	// release the sorted buffer, if any
	if ( m_sorted ) {
		mfree ( m_sorted , m_sortedSize , "asortbuf" );
	}
	m_sorted             = NULL;
	m_sortedValid        = false;
	m_uniqueStreetHashes = 0;
}
static int64_t h_court;
static int64_t h_i;
static int64_t h_interstate;
static int64_t h_page ;
static int64_t h_corner ;
static int64_t h_between ;
static int64_t h_btwn ;
static int64_t h_bet ;
static int64_t h_streets ;
static int64_t h_sts ;
static int64_t h_at ;
static int64_t h_come ;
static int64_t h_is ;
static int64_t h_located ;
static int64_t h_intersection;
static int64_t h_law ;
static int64_t h_address ;
static int64_t h_added ;
static int64_t h_copy ;
static int64_t h_search ;
static int64_t h_find ;
static int64_t h_go ;
static int64_t h_town ;
static int64_t h_city ;
static int64_t h_street ;
static int64_t h_telephone;
static int64_t h_tel ;
static int64_t h_ph ;
static int64_t h_fax ;
static int64_t h_where ;
static int64_t h_location;
static int64_t h_venue ;
static int64_t h_map ;
static int64_t h_office ;
static int64_t h_center ;
static int64_t h_mailing ;
static int64_t h_mail ;
static int64_t h_snail ;
static int64_t h_edit ;
static int64_t h_email ;
static int64_t h_phone ;
static int64_t h_inc ;
static int64_t h_llc ;
static int64_t h_review ;
static int64_t h_reviews ;
static int64_t h_write ;
static int64_t h_add ;
static int64_t h_view ;
static int64_t h_favorites ;
static int64_t h_more ;
static int64_t h_info ;
static int64_t h_information ;
static int64_t h_the ;
static int64_t h_in ;
static int64_t h_a ;
static int64_t h_paseo ;
static int64_t h_de ;
static int64_t h_del ;
static int64_t h_all ;
static int64_t h_rights ;
static int64_t h_reserved ;
static int64_t h_contact ;
static int64_t h_us ;
static int64_t h_by ;
static int64_t h_of ;
static int64_t h_for ;
static int64_t h_arrangements ;
static int64_t h_arranged ;
static int64_t h_sponsored ;
static int64_t h_to ;
static int64_t h_every ;
static int64_t h_p ;
static int64_t h_b ;
static int64_t h_hwy ;
static int64_t h_state ;
static int64_t h_county ;
static int64_t h_cnty ;
static int64_t h_cty ;
static int64_t h_road ;
static int64_t h_route ;
static int64_t h_rte ;
static int64_t h_rt ;
static int64_t h_highway ;
static int64_t h_hiway ;
static int64_t h_cr ;
static int64_t h_o ;
static int64_t h_po ;
static int64_t h_post ;
static int64_t h_box ;
static int64_t h_top ;
static int64_t h_one ;
static int64_t h_noon ;
static int64_t h_midnight ;
static int64_t h_daily ;
static int64_t h_st ;
static int64_t h_nd ;
static int64_t h_rd ;
static int64_t h_th ;
static int64_t h_away ;
static int64_t h_results ;
static int64_t h_days ;
static int64_t h_blocks ;
static int64_t h_block ;
static int64_t h_miles ;
static int64_t h_mile ;
static int64_t h_year ;
static int64_t h_years ;
static int64_t h_yr ;
static int64_t h_yrs ;
static int64_t h_hours ;
static int64_t h_hrs ;
static int64_t h_hour ;
static int64_t h_hr ;
static int64_t h_mi ;
static int64_t h_kilometers;
static int64_t h_km ;
static int64_t h_copyright ;
static int64_t h_and ;
static int64_t h_or ;
static int64_t h_suite ;
static int64_t h_ste ;
static int64_t h_bldg ;
static int64_t h_bld ;
static int64_t h_building ;
static int64_t h_unit ;
static int64_t h_room ;
static int64_t h_pier ;
static int64_t h_rm ;
static int64_t h_run ;
static int64_t h_ne ;
static int64_t h_nw ;
static int64_t h_se ;
static int64_t h_sw ;
static int64_t h_n ;
static int64_t h_s ;
static int64_t h_e ;
static int64_t h_w ;
static int64_t h_north;
static int64_t h_northeast;
static int64_t h_northwest;
static int64_t h_east;
static int64_t h_west;
static int64_t h_south;
static int64_t h_southeast;
static int64_t h_southwest;
static int64_t h_heart ;
static int64_t h_core ;
static int64_t h_least ;
static int64_t h_most ;
static int64_t h_this ;
static int64_t h_appeared ;
static int64_t h_role ;
static int64_t h_studied;
static int64_t h_prize;
static int64_t h_finish;
static int64_t h_door;
static int64_t h_entrance;
static int64_t h_area;
static int64_t h_left ;
static int64_t h_right ;
static int64_t h_stare ;
static int64_t h_sea ;
static int64_t h_discount ;
static int64_t h_discounted ;
static int64_t h_www;
static int64_t h_gaze ;
static int64_t h_look ;
static int64_t h_looking;
static int64_t h_be ;
static int64_t h_determined ;
static int64_t h_call ;
static int64_t h_details;
static int64_t h_tba;
static int64_t h_avenue;
static int64_t h_ave;
static int64_t h_register;
static int64_t h_sign;
static int64_t h_up;
static int64_t h_signup;
static int64_t h_tickets;
static int64_t h_purchase;
static int64_t h_get;
static int64_t h_enroll;
static int64_t h_buy;
static int64_t h_presale ;
static int64_t h_pre ;
static int64_t h_sale ;
static int64_t h_on ;
static int64_t h_sales ;
static int64_t h_end ;
static int64_t h_begin ;
static int64_t h_start ;
static int64_t h_am;
static int64_t h_fm;
// . first identifies all the "Places" using the rules above
// . then clusters the "Places" together into an "Address"
// . we use the address at the top of the page, and the site contact info,
// etc. to be defaults, so we can inherit, city, state, etc. from those
// . returns false if blocked, true otherwise. sets g_errno on error.
bool Addresses::set ( Sections *sections ,
Words *words ,
Bits *bits ,
TagRec *gr ,
Url *url ,
int64_t docId ,
//char *coll ,
collnum_t collnum ,
int32_t domHash32 ,
int32_t ip ,
//int32_t tagPairHash ,
int32_t niceness ,
SafeBuf *pbuf ,
void *state ,
void (*callback) (void *state) ,
uint8_t contentType ,
// from XmlDoc::ptr_addressReply in a title rec
//char *addressReply ,
//int32_t addressReplySize ,
//bool addressReplyValid ,
char *siteTitleBuf ,
int32_t siteTitleBufSize ,
XmlDoc *xd ) {
reset();
// save stuff
m_xd = xd;
m_sections = sections;
m_words = words;
m_wptrs = words->m_words;
m_wlens = words->m_wordLens;
m_nw = words->m_numWords;
m_wids = words->getWordIds();
m_tids = words->getTagIds();
m_bits = bits;
m_gr = gr;
m_url = url;
m_docId = docId;
m_collnum = collnum;
m_domHash32 = domHash32;
m_ip = ip;
//m_tagPairHash = tagPairHash;
m_niceness = niceness;
m_pbuf = pbuf;
m_state = state;
m_callback = callback;
m_contentType = contentType;
//m_addressReply = addressReply;
//m_addressReplySize = addressReplySize;
//m_addressReplyValid = addressReplyValid;
m_siteTitleBuf = siteTitleBuf;
m_siteTitleBufSize = siteTitleBufSize;
m_sb.purge();
static bool s_setHashes = false;
if ( ! s_setHashes ) {
// flag it
s_setHashes = true;
// int16_tcuts
h_i = hash64n ("i");
h_court = hash64n ("court");
h_interstate = hash64n ("interstate");
h_page = hash64n ("page");
h_corner = hash64n ("corner");
h_between = hash64n ( "between");
h_btwn = hash64n ( "btwn");
h_bet = hash64n ( "bet");
h_streets = hash64n ( "streets");
h_sts = hash64n ( "sts");
h_at = hash64n ( "at" );
h_come = hash64n ("come");
h_is = hash64n ( "is" );
h_located = hash64n ( "located" );
h_intersection = hash64n("intersection");
h_law = hash64 ( "law" ,3);
h_address = hash64 ( "address",7);
h_added = hash64 ( "added",5);
h_copy = hash64 ( "copy",4);
h_search = hash64 ( "search",6);
h_find = hash64 ( "find",4);
h_go = hash64 ( "go",2);
h_town = hash64n ( "town");
h_city = hash64n ( "city");
h_street = hash64 ( "street",6);
h_telephone = hash64 ( "telephone",9);
h_tel = hash64 ( "tel",3);
h_ph = hash64 ( "ph",2);
h_fax = hash64 ( "fax",3);
h_where = hash64 ( "where",5);
h_location= hash64 ( "location",8);
h_venue = hash64n("venue");
h_map = hash64 ( "map" ,3);
h_office = hash64 ( "office" ,6);
h_center = hash64n ("center");
h_mailing = hash64 ( "mailing" ,7);
h_mail = hash64 ( "mail" ,4);
h_snail = hash64 ( "snail" ,5);
h_edit = hash64 ( "edit" ,4);
h_email = hash64 ( "email" ,5);
h_phone = hash64 ( "phone" ,5);
h_inc = hash64 ( "inc" ,3);
h_llc = hash64 ( "llc" ,3);
h_review = hash64 ( "review" ,6);
h_reviews = hash64 ( "reviews" ,7);
h_write = hash64 ( "write", 5);
h_add = hash64 ( "add",3 );
h_view = hash64 ( "view", 4);
h_favorites = hash64 ( "favorites", 9);
h_more = hash64 ( "more",4 );
h_info = hash64 ( "info",4 );
h_information = hash64 ( "information", 11);
h_the = hash64 ( "the" ,3);
h_in = hash64 ( "in" ,2);
h_a = hash64 ( "a" ,1);
h_paseo = hash64n ( "paseo");
h_de = hash64n ( "de");
h_del = hash64n ( "del");
h_all = hash64 ( "all" ,3);
h_rights = hash64 ( "rights" ,6);
h_reserved = hash64 ( "reserved" ,8);
h_contact = hash64 ( "contact" , 7);
h_us = hash64 ( "us" , 2);
h_by = hash64 ( "by" ,2);
h_of = hash64 ( "of" ,2);
h_for = hash64 ( "for" ,3);
h_arrangements = hash64("arrangements",12);
h_arranged = hash64("arranged",8);
h_sponsored = hash64("sponsored",9);
h_to = hash64 ( "to" ,2);
h_every = hash64 ( "every",5);
h_p = hash64 ( "p" ,1);
h_b = hash64n ( "b" );
h_hwy = hash64 ( "hwy" ,3);
h_state = hash64 ( "state" ,5);
h_county = hash64 ( "county" , 6 );
h_cnty = hash64 ( "cnty" , 4 );
h_cty = hash64 ( "cty" , 3 );
h_road = hash64 ( "road" ,4);
h_route = hash64 ( "route" ,5);
h_rte = hash64 ( "rte" ,3);
h_rt = hash64 ( "rt" ,2);
h_highway = hash64 ( "highway" ,7);
h_hiway = hash64 ( "hiway" ,5);
h_cr = hash64 ( "cr" ,2);
h_o = hash64 ( "o" ,1);
h_po = hash64 ( "po" ,2);
h_post = hash64 ( "post" ,4);
h_box = hash64 ( "box" ,3);
h_top = hash64n ( "top" );
h_one = hash64 ( "one" ,3);
h_noon = hash64n ( "noon" );
h_midnight = hash64n ( "midnight" );
h_daily = hash64n ( "daily" );
h_st = hash64 ( "st" ,2);
h_nd = hash64 ( "nd" ,2);
h_rd = hash64 ( "rd" ,2);
h_th = hash64 ( "th" ,2);
h_away = hash64 ( "away" ,4);
h_results = hash64 ( "results" , 7 );
h_days = hash64 ( "days", 4 );
h_blocks = hash64 ( "blocks",6);
h_block = hash64 ( "block",5);
h_miles = hash64 ( "miles",5);
h_mile = hash64n ( "mile");
h_year = hash64n("year");
h_years = hash64n("years");
h_yr = hash64n("yr");
h_yrs = hash64n("yrs");
h_hours = hash64 ( "hours",5);
h_hrs = hash64 ( "hrs",3);
h_hour = hash64n ( "hour");
h_hr = hash64n ( "hr");
h_mi = hash64 ( "mi",2);
h_kilometers= hash64 ( "kilometers",10);
h_km = hash64 ( "km",2);
h_copyright = hash64 ( "copyright",9);
h_and = hash64 ( "and" , 3 );
h_or = hash64 ( "or" , 2 );
h_suite = hash64 ( "suite",5);
h_ste = hash64 ( "ste",3);
h_bldg = hash64 ( "bldg",4);
h_bld = hash64n ( "bld");
h_building = hash64 ( "building",8);
h_unit = hash64 ( "unit",4);
h_room = hash64 ( "room",4);
h_pier = hash64 ( "pier",4);
h_rm = hash64 ( "rm",2);
h_run = hash64n ("run");
h_ne = hash64 ( "ne" ,2);
h_nw = hash64 ( "nw" ,2);
h_se = hash64 ( "se" ,2);
h_sw = hash64 ( "sw" ,2);
h_n = hash64 ( "n" ,1);
h_s = hash64 ( "s" ,1);
h_e = hash64 ( "e" ,1);
h_w = hash64 ( "w" ,1);
h_north = hash64n("north");
h_south = hash64n("south");
h_east = hash64n("east");
h_west = hash64n("west");
h_northeast = hash64n("northeast");
h_northwest = hash64n("northwest");
h_southeast = hash64n("southeast");
h_southwest = hash64n("southwest");
h_heart = hash64n ( "heart" );
h_core = hash64n ( "core" );
h_least = hash64n ( "least" );
h_most = hash64n ( "most" );
h_this = hash64n ( "this" );
h_north = hash64n ( "north" );
h_south = hash64n ( "south" );
h_east = hash64n ( "east" );
h_west = hash64n ( "west" );
h_appeared = hash64n ( "appeared" );
h_role = hash64n ( "role" );
h_studied = hash64n ( "studied" );
h_prize = hash64n ( "prize" );
h_finish = hash64n("finish");
h_door = hash64n("door");
h_entrance = hash64n("entrance");
h_area = hash64n("area");
h_left = hash64n ( "left" );
h_right = hash64n ( "right" );
h_stare = hash64n ( "stare" );
h_sea = hash64n ( "sea" );
h_discount = hash64n("discount");
h_discounted = hash64n("discounted");
h_www = hash64n("www");
h_gaze = hash64n ( "gaze" );
h_look = hash64n ( "look" );
h_looking = hash64n ( "looking" );
h_be = hash64n("be");
h_determined = hash64n("determined");
h_call = hash64n("call");
h_details = hash64n("details");
h_tba = hash64n("tba");
h_avenue = hash64n("avenue");
h_ave = hash64n("ave");
h_register = hash64n("register");
h_sign = hash64n("sign");
h_up = hash64n("up");
h_signup = hash64n("signup");
h_tickets = hash64n("tickets");
h_purchase = hash64n("purchase");
h_get = hash64n("get");
h_enroll = hash64n("enroll");
h_buy = hash64n("buy");
h_presale = hash64n("presale");
h_pre = hash64n("pre");
h_sale = hash64n("sale");
h_on = hash64n("on");
h_sales = hash64n("sales");
h_end = hash64n("end");
h_begin = hash64n("begin");
h_start = hash64n("start");
h_am = hash64n("am");
h_fm = hash64n("fm");
}
//m_msg2c.m_mcast.reset();
// sanity check -- did set2() corrupt our junk?
//if ( m_msg2c.m_mcast.m_ownMsg && m_msg2c.m_mcast.m_msgSize > 5000 ){
// char *xx=NULL;*xx=0; }
// returns false and sets g_errno on error
bool status = set2 ( );
// sanity check -- did set2() corrupt our junk?
//if ( m_msg2c.m_mcast.m_ownMsg && m_msg2c.m_mcast.m_msgSize > 5000 ){
// char *xx=NULL;*xx=0; }
// sanity check
if ( ! status && ! g_errno ) { char *xx=NULL;*xx=0; }
// return true on error now
if ( ! status ) return true;
// . ok, go no further if from msg13
// . it will have to check m_good or something, not m_valid
if ( ! m_sections ) return true;
// if valid and empty, we are done
//if ( m_addressReplyValid && ! m_addressReply ) return true;
/*
-- mdw took this out because it had too many false positives. often
the place name 1 and/or 2 was wrong and was calling nonsense a
place! for many urls... and now that i removed the
SEC_CONTENDED_ADDRESS algo all the events on a page even if
different tag hashes, can share the same address. to replace
that algo i am ignore events with SEC_TITLE_OUTLINKED if the
event title is an outlink to another page, and also i am trying
to identify all place names in events. this outlinked bit should
fix the http://www.zvents.com/albuquerque-nm/events/show/88543421-the-love-song-of-j-robert-oppenheimer-by-carson-kreitzer url, since it has a
little section that has "You may Also Like..." for events at
different venues, mentioned by name.
//
// . SELF-VERIFICATION LOOPS
//
// . now use the addresses that were inlined to verify those
// that were not inlined, assuming the place name matches
// . this will allow "The Filling Station" to be verified in
// http://www.zvents.com/albuquerque-nm/events/show/
// 88543421-the-love-song-of-j-robert-oppenheimer-by-carson-kreitzer
// . first scan the addresses for inlined ones
// . logic taken basically from hashForPlacedb()
//
// init the table
HashTableX pt;
// returns true with g_errno set on error
if ( ! pt.set ( 8,4,256,NULL,0,false,m_niceness) ) return true;
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// get it
Address *a = &m_addresses[i];
// must be inlined
if ( ! ( a->m_flags & AF_INLINED ) ) continue;
// sometimes a street can exist in two cities or states
if ( a->m_flags & AF_AMBIGUOUS ) continue;
// must not have a place name in place of the street name
if ( a->m_street.m_flags2 & PLF2_IS_NAME ) continue;
// hash into table only if valid
int64_t h1 = a->m_name1.m_hash;
// adjust it since setHashes() xors in 0x123456 for street
// names that are actually place names in disguise
h1 ^= 0x123456;
// incorporate the adm1 and city and ctry
h1 = hash64 ( a->m_city.m_hash , h1 );
h1 = hash64 ( a->m_adm1.m_hash , h1 );
h1 = hash64 ( a->m_ctry.m_hash , h1 );
// put it in
if ( a->m_name1.m_strlen && ! pt.addKey ( (char *)&h1, &a ) )
return true;
// same for second place name
int64_t h2 = a->m_name2.m_hash;
// adjust it since setHashes() xors in 0x123456 for street
// names that are actually place names in disguise
h2 ^= 0x123456;
// incorporate the adm1 and city and ctry
h2 = hash64 ( a->m_city.m_hash , h2 );
h2 = hash64 ( a->m_adm1.m_hash , h2 );
h2 = hash64 ( a->m_ctry.m_hash , h2 );
// hash into table only if valid
if ( a->m_name2.m_strlen && ! pt.addKey ( (char *)&h2, &a ) )
return true;
}
// now scan our addresses that have a place name in place of
// the street name and see if we can get a match
for ( int32_t i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// get it
Address *a = &m_addresses[i];
// we want a place name in place of the street name now
if ( ! ( a->m_street.m_flags2 & PLF2_IS_NAME ) ) continue;
// . USE the STREET here, not the name