import json
import re
+import time

from examples.mockingbird.mockingbooks_py.entities import EntitiesFilter, search_entities
-
-# Path to the file
-file_path = 'tmp/tmp.txt'
-
-# Open the file and read its contents
-with open(file_path, 'r') as file:
-    text = file.read()
-
-# Define two patterns
-pattern1 = r'data-anonymize="person-name">\s*([^<]+?)\s*</a>'
-pattern2 = r'<span data-anonymize="company-name">\s*([^<]+?)\s*</span>'
+from examples.mockingbird.mockingbooks_py.google_search_regex.dynamic_google_search import start_wf

agg_prompt = ("Can you summarize the relevant person and/or associated business entity from the search results? It "
              "should use the most relevant search result that matches best and ignore others to prevent mixing"
              ...
              "with the metadata sources that are associated from that platform, business, ie. LinkedIn, Twitter, etc."
              "so that we associate the correct entity metadata with the correct platforms.")

-# Find all matches for both patterns
-matches1 = re.findall(pattern1, text)
-matches2 = re.findall(pattern2, text)

-# Check if lengths are equal
-if len(matches1) > len(matches2):
-    matches1, matches2 = matches2, matches1
+# Function to process each <li> element
+def process_li_element(li_text):
+    # Patterns for extracting the person's name and the company name
+    name_pattern = r'data-anonymize="person-name">\s*([^<]+?)\s*</span>'
+    # The company pattern skips any remaining tag attributes and stops capturing at the first '<'
+    company_pattern = r'data-anonymize="company-name"[^>]*>\s*([^<]+?)\s*<'
+
+    # Search for the name and company within the <li> element text
+    name_match = re.search(name_pattern, li_text)
+    company_match = re.search(company_pattern, li_text)
+
+    # Return both values only when the name and the company are found
+    if name_match and company_match:
+        return name_match.group(1), company_match.group(1)
+    else:
+        return None
+
+
+def iterate_on_matches():
+    # List to store all matches
+    all_matches = []
+    count = 0
+
+    # Process files from page1 to page60
+    for page_num in range(1, 61):
+        file_path = f'tmp/page{page_num}.txt'
+
+        # Open the file and read its HTML contents
+        with open(file_path, 'r') as file:
+            text = file.read()
+
+        # Regex pattern that captures each <li> element in the page
+        li_pattern = r'<li class="artdeco-list__item[^>]*>.*?</li>'
+
+        # Find all <li> elements
+        li_elements = re.findall(li_pattern, text, re.DOTALL)

-offset_l = 80
-offset_r = 100
+        # Process each <li> element to extract names and companies
+        matches = [process_li_element(li) for li in li_elements]

-# skip next
-# for i in range(len(matches1)):
-#     person_company = f"{i}:{matches1[i]} (company)"
-#     if 0 + offset_l < i < 1 + offset_r:
-#         print(person_company)
-#         start_wf(person_company, agg_prompt)
+        # Filter out any None results
+        matches = [match for match in matches if match is not None]

+        # Iterate and print each match
+        for name, company in matches:
+            count += 1

-# Example iteration and action simulation with 'start_wf' function.
-# Since the 'start_wf' function is a conceptual example, we'll simulate its operation as a print statement.
-# for person_company in formatted_people_companies:
-#     # Simulate calling 'start_wf' function with the person_company tuple and aggregated prompt.
-#     start_wf(person_company, agg_prompt)
-#
+            # Only dispatch entries once the running count passes 722
+            if count > 722:
+                nc = f'Name: {name}, Company: {company}'
+                print(nc)
+                start_wf(nc, agg_prompt)
+                all_matches += [(name, company)]
+                time.sleep(10)
+                print(count)

-# skip next

if __name__ == '__main__':
+    iterate_on_matches()
    search_entities_f = EntitiesFilter()

    pretty_data1 = search_entities(search_entities_f)
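
The two patterns in process_li_element are easy to sanity-check against a small hand-written fragment. The snippet below is a hypothetical stand-in for the saved markup in tmp/pageN.txt (the real pages are not shown in this diff), so the surrounding tags and attributes are assumptions; only the data-anonymize patterns mirror the committed code.

import re

# Hypothetical <li> fragment; the real saved pages may differ.
sample_li = (
    '<li class="artdeco-list__item pl3 pv3">'
    '<span data-anonymize="person-name"> Jane Doe </span>'
    '<a data-anonymize="company-name" href="#"> Acme Corp </a>'
    '</li>'
)

# Same patterns as in process_li_element
name_pattern = r'data-anonymize="person-name">\s*([^<]+?)\s*</span>'
company_pattern = r'data-anonymize="company-name"[^>]*>\s*([^<]+?)\s*<'

name_match = re.search(name_pattern, sample_li)
company_match = re.search(company_pattern, sample_li)

if name_match and company_match:
    # Prints: Jane Doe | Acme Corp
    print(name_match.group(1), '|', company_match.group(1))

Both patterns wrap the capture group in \s*, so the captured values come back without the padding whitespace inside the tags.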
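
iterate_on_matches leans on two small conventions: a running count that skips everything up to the hard-coded threshold of 722 (presumably so an interrupted run can pick up where it left off) and a time.sleep(10) between start_wf calls as a crude rate limit. Below is a minimal sketch of that resume-and-throttle pattern in isolation, using hypothetical items, a threshold of 2, and a 1-second delay in place of the committed values.

import time

def replay_with_resume(items, already_processed=2, delay_seconds=1):
    # Skip items handled in an earlier run, then throttle the rest.
    count = 0
    for name, company in items:
        count += 1
        if count <= already_processed:
            continue  # dispatched before the previous run stopped
        print(f'Name: {name}, Company: {company}')  # stands in for start_wf(nc, agg_prompt)
        time.sleep(delay_seconds)

replay_with_resume([('A', 'X'), ('B', 'Y'), ('C', 'Z'), ('D', 'W')])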