"""
Sequel Sift: A web scraping and analysis tool for startup company websites.
This module provides functionality to extract and analyze information from startup
company websites, including company names, descriptions, founder details, and
product information. It uses BeautifulSoup for HTML parsing and NLTK for text
analysis.
Main Components:
- SequelSift: Main class for website analysis
- text_cleaner: Utility function for text normalization
- extract_company_name: Function for company name extraction from text
- analyze_phrase: Helper function for text analysis using NLTK
Dependencies:
- requests: For making HTTP requests
- beautifulsoup4: For HTML parsing
- nltk: For natural language processing
- pandas: For data organization
- re: For regular expressions
- urllib: For URL handling
Example Usage:
>>> analyzer = SequelSift()
>>> results = analyzer.analyze_website('example.com')
>>> df = pd.DataFrame([results])
Author: Emmanuel Ezenwere
Version: 1.0.0
"""
import re
import time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
nltk.download('punkt') # Tokenization
nltk.download('words') # Wordlist for English words
nltk.download('maxent_ne_chunker') # For Named Entity Recognition
nltk.download('averaged_perceptron_tagger') # POS Tagger
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords') # Common stopwords list
nltk.download('wordnet') # WordNet lexical database
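# Note (assumption): on newer NLTK releases (>= 3.8.2) the Punkt tokenizer
# data ships as 'punkt_tab'; if word_tokenize/sent_tokenize raise a
# LookupError despite the downloads above, this extra download should help:
# nltk.download('punkt_tab')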
def analyze_phrase(text):
"""
    Analyze a phrase as a candidate company name.

    Returns a tuple of (candidate phrase, proper-noun count, token count);
    the phrase is None when the text has no proper nouns and is not a single noun.
"""
tokens = word_tokenize(text)
tagged = pos_tag(tokens)
proper_nouns = [word for word, tag in tagged if tag == 'NNP']
is_single_word = len(tagged) == 1 and tagged[0][1].startswith('NN')
if proper_nouns:
return ' '.join(proper_nouns), len(proper_nouns), len(tokens)
elif is_single_word:
return tagged[0][0], 0, len(tokens)
return None, 0, len(tokens)
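# Illustrative sketch of analyze_phrase (hypothetical inputs; the exact POS
# tags depend on the NLTK tagger model, so results may vary):
# >>> analyze_phrase('Acme Robotics')              # both tokens tagged NNP
# ('Acme Robotics', 2, 2)
# >>> analyze_phrase('fastest delivery platform')  # no proper nouns, 3 tokens
# (None, 0, 3)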
def extract_company_name(text):
"""
Extract company name from text using NLTK POS tagging
Args:
text (str): Input text containing company name
Returns:
        str | None: Most likely company name, or None if the text does not
            follow the 'Left | Right' title pattern
"""
parts = [part.strip() for part in text.split('|')]
if len(parts) != 2:
return None
left_phrase, left_proper_count, left_word_count = analyze_phrase(parts[0])
right_phrase, right_proper_count, right_word_count = analyze_phrase(parts[1])
    # If one side is all proper nouns, return it; if both sides are, return the shorter one.
if left_proper_count == left_word_count and right_proper_count == right_word_count:
return left_phrase if left_word_count < right_word_count else right_phrase
if left_proper_count == left_word_count:
return left_phrase
if right_proper_count == right_word_count:
return right_phrase
# If not all NNPs, return the shorter one
return left_phrase if left_word_count < right_word_count else right_phrase
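# Illustrative sketch of the 'Left | Right' title heuristic (hypothetical
# titles; again subject to the tagger's POS decisions):
# >>> extract_company_name('Acme | The fastest delivery platform')
# 'Acme'   # the left side is entirely proper nouns, so it is chosen
# >>> extract_company_name('A title without a pipe')
# None     # no single '|' separator, so no candidate is returned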
def text_cleaner(text):
"""Cleans and normalizes text by removing special characters and formatting.
Processes text through the following steps:
1. Splits text into sentences
2. Removes all non-alphanumeric characters
3. Strips whitespace
4. Joins sentences with commas
Args:
text (str): Raw text string to be cleaned
Returns:
str: Cleaned text with sentences joined by commas
Example:
>>> text_cleaner("Hello, world! This is a test.")
"Hello world,This is a test"
Note:
- Preserves only letters, numbers, and spaces
- Removes punctuation, special characters, and extra whitespace
- Maintains sentence boundaries using commas
"""
# Tokenize text into sentences
sentences = sent_tokenize(text)
    # Replace runs of non-alphanumeric characters (punctuation and extra
    # whitespace) with a single space, so the output matches the docstring example
    processed_sentences = [re.sub(r'[^a-zA-Z0-9]+', ' ', sentence).strip()
                           for sentence in sentences]
# Join sentences with commas
return ','.join(processed_sentences)
class SequelSift:
"""A class for analyzing company websites to extract key business information.
This class provides methods to scrape and analyze company websites, extracting
information such as company names, descriptions, founder details, and product
information. It handles URL normalization, page fetching, and HTML parsing.
Attributes:
headers (dict): HTTP headers used for web requests, including user agent
    Methods:
        analyze_website: Main method to analyze a company's website
        _fetch_with_retry: Fetches a page with retry and backoff logic
        _extract_company_name: Extracts company name from HTML
        _extract_description: Extracts company/product description
        _find_founders: Extracts founder information
        _extract_product_info: Extracts product-related information
        _find_about_page: Locates company's about/team page
Example Usage:
analyzer = SequelSift()
result = analyzer.analyze_website('example.com')
print(result['company_name'])
print(result['description'])
"""
def __init__(self):
"""Initialize with headers and retry settings"""
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
self.max_retries = 3
self.retry_delay = 2 # seconds
def _fetch_with_retry(self, url):
"""Fetches a webpage with retry logic for reliability.
Args:
url (str): URL to fetch
Returns:
BeautifulSoup | None: Parsed HTML content or None if all retries fail
"""
for attempt in range(self.max_retries):
try:
response = requests.get(url, headers=self.headers, timeout=10)
response.raise_for_status() # Raise an HTTPError for bad responses
return BeautifulSoup(response.text, 'html.parser')
except Exception as e:
print(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
# Don't sleep on last attempt
if attempt < self.max_retries - 1:
print("re-attempting extraction")
                    time.sleep(self.retry_delay * (attempt + 1))  # Linear backoff: 2s, then 4s
return None
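    # With the defaults above (max_retries=3, retry_delay=2), a persistently
    # failing URL is attempted at roughly t=0s, t=2s, and t=6s before the
    # method gives up and returns None.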
def analyze_website(self, domain: str) -> dict:
"""Analyzes a website with retry logic for reliability."""
result = {
'domain': None,
'company_name': None,
'description': None,
'founders': None,
'product_info': None
}
try:
# Ensure domain has proper format
if not domain.startswith(('http://', 'https://')):
if not domain.startswith('www.'):
domain = 'www.' + domain
domain = 'https://' + domain
result['domain'] = domain
# Fetch main page with retry
soup = self._fetch_with_retry(domain)
if soup is None:
print(f"Failed to fetch {domain} after {self.max_retries} attempts")
return result
# Extract information
result['company_name'] = self._extract_company_name(soup)
result['description'] = self._extract_description(soup)
result['founders'] = self._find_founders(soup)
result['product_info'] = self._extract_product_info(soup)
# Try to find and fetch about page
about_page = self._find_about_page(domain, soup)
if about_page:
about_soup = self._fetch_with_retry(about_page)
                if about_soup:
                    about_founders = self._find_founders(about_soup)
                    # Merge about-page founders with any found on the main page
                    if about_founders:
                        if result['founders']:
                            result['founders'].update(about_founders)
                        else:
                            result['founders'] = about_founders
return result
except Exception as e:
print(f'Error analyzing {domain}: {str(e)}')
return result
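    # Illustrative sketch of the domain normalization above (hypothetical
    # inputs): 'example.com' becomes 'https://www.example.com', while
    # 'https://example.com' is passed through unchanged.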
def _extract_company_name(self, soup):
"""Extracts company name from webpage HTML content.
Attempts to find company name from multiple sources in HTML:
1. Meta tags (og:site_name)
2. Page title tag
Args:
soup (BeautifulSoup): Parsed HTML content in BeautifulSoup format
Returns:
str | None: First found company name from potential sources,
or None if no company name could be extracted
"""
potential_names = []
        # Check meta tags, guarding against a tag with no content attribute
        meta_title = soup.find('meta', property='og:site_name')
        if meta_title and meta_title.get('content'):
            potential_names.append(meta_title['content'])
        # Check main title
        title = soup.find('title')
        if title:
            company_name = extract_company_name(title.text)
            if company_name:
                potential_names.append(company_name)
return potential_names[0] if potential_names else None
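    # Precedence note: an og:site_name meta value is appended first, so when
    # present it wins over a name parsed from the <title> tag.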
def _extract_description(self, soup):
"""Extracts website description from webpage HTML content.
Searches for description in following priority order:
1. Meta description tag
2. First paragraph text
The extracted text is cleaned before being returned.
Args:
soup (BeautifulSoup): Parsed HTML content in BeautifulSoup format
Returns:
str | None: Cleaned description text if found,
None if no description could be extracted
"""
        # Try to find meta description, guarding against a missing content attribute
        meta_desc = soup.find('meta', {'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            return text_cleaner(meta_desc['content'])
# Try to find first meaningful paragraph
first_p = soup.find('p')
if first_p:
return text_cleaner(first_p.text.strip())
return None
def _find_founders(self, soup):
"""Extracts founder names from webpage HTML content.
Searches through various HTML elements (p, div, headers) for founder-related
keywords and attempts to extract associated names. Looks for text patterns
where names typically appear before founder-related titles.
Args:
soup (BeautifulSoup): Parsed HTML content in BeautifulSoup format
Returns:
set[str] | None: Set of cleaned founder names if found,
None if no founders could be identified or on error
Example extracted patterns:
"John Smith, Founder"
"Jane Doe, CEO"
"Bob Wilson, Co-Founder & CTO"
"""
try:
founders = set()
founder_keywords = ['founder', 'co-founder', 'ceo', 'chief executive']
# Look for team sections or about sections
for elem in soup.find_all(['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
text = elem.text.lower()
if any(keyword in text for keyword in founder_keywords):
                    # Simple approach to extract names; scanning range(len(words))
                    # rather than range(len(words) - 1) so a trailing keyword
                    # (e.g. "... Jane Doe, Founder") is not skipped
                    words = text.split()
                    for i in range(len(words)):
                        if any(keyword in words[i] for keyword in founder_keywords):
                            # Look for the name in the two words before the keyword
                            potential_name = ' '.join(words[max(0, i - 2):i]).strip()
                            if potential_name and len(potential_name.split()) >= 2:
                                founders.add(text_cleaner(potential_name))
            # founders is a set, so test truthiness (set() == {} is always False)
            if not founders:
                return None
            return founders
except Exception:
return None
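    # Illustrative walk-through (hypothetical page text): an element reading
    # "Jane Doe, Founder" is lowercased to "jane doe, founder"; 'founder'
    # matches at index 2, the two preceding words give "jane doe,", and
    # text_cleaner() strips the comma, yielding the candidate "jane doe".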
def _extract_product_info(self, soup):
"""Extracts product-related information from webpage HTML content.
Searches for product information in three main areas:
1. Feature headers (class='feature-header')
2. Product block details (class='product-block-details')
3. Product list titles (class='product-list-title')
Args:
soup (BeautifulSoup): Parsed HTML content in BeautifulSoup format
Returns:
dict[str, list[str]]: Dictionary containing product information with keys:
- products: List of product names/titles
- features: List of product features/highlights
- descriptions: List of product descriptions
Note:
Duplicates are removed while preserving the order of discovery.
All text values are stripped of leading/trailing whitespace.
"""
product_info = {
'products': [],
'features': [],
'descriptions': []
}
# Extract from feature headers
feature_headers = soup.find_all('div', class_='feature-header')
for header in feature_headers:
h3 = header.find('h3')
if h3:
product_info['products'].append(h3.text.strip())
# Extract from product block details
product_blocks = soup.find_all('div', class_='product-block-details')
for block in product_blocks:
title = block.find('h3', class_='product-block-title')
if title:
product_info['products'].append(title.text.strip())
# Extract from product list titles
list_titles = soup.find_all('div', class_='product-list-title')
for title_block in list_titles:
h2 = title_block.find('h2')
p = title_block.find('p')
if h2:
product_info['features'].append(h2.text.strip())
if p:
product_info['descriptions'].append(p.text.strip())
# Remove duplicates while preserving order
for key in product_info:
product_info[key] = list(dict.fromkeys(product_info[key]))
return product_info
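    # Note: the CSS class names targeted above ('feature-header',
    # 'product-block-details', 'product-list-title') are site-specific;
    # pages that use different markup will simply yield empty lists.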
def _find_about_page(self, base_url, soup):
"""Finds the URL of the company's about or team page.
Searches for links containing 'about' or 'team' in their href attributes
(case-insensitive) and constructs the full URL using the base URL.
Args:
base_url (str): The website's base URL (e.g., 'https://example.com')
soup (BeautifulSoup): Parsed HTML content in BeautifulSoup format
Returns:
str | None: Full URL of the about/team page if found,
None if no relevant page could be found
Example:
base_url: 'https://example.com'
found href: '/about-us'
returns: 'https://example.com/about-us'
"""
about_links = soup.find_all('a', href=re.compile(r'about|team', re.I))
if about_links:
return urljoin(base_url, about_links[0]['href'])
return None
startup_domains = [
'tonestro.com',
'sendtrumpet.com',
'prewave.com',
'twinn.health',
'kokoon.io'
]
def main(domains):
""" Run scraping on a list of websites.
Args:
domains (list): list of strings of company domains.
"""
print("\n")
print("-"*50)
print("Sequel Sift -- Extracting Startup data...")
print("-"*50)
print("\n")
analyzer = SequelSift()
results = [analyzer.analyze_website(domain) for domain in domains]
return pd.DataFrame(results)
if __name__ == "__main__":
company_infos = main(startup_domains)
print(company_infos)