This repository has been archived by the owner on Dec 27, 2024. It is now read-only.

Merge pull request #4 from ishanm0/feature-scraper
Modified scraper
jian-li1 authored Nov 7, 2024
2 parents 8519662 + 50c273c commit 553f2f5
Showing 5 changed files with 226 additions and 18 deletions.
166 changes: 166 additions & 0 deletions .gitignore
@@ -0,0 +1,166 @@
# API key files
*google.json
*openai.json

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
26 changes: 19 additions & 7 deletions backend/app.py
@@ -1,4 +1,7 @@
import io
from threading import Thread
import os
import json

from src.config import DATA_DIR_PATH
from src.File.FileManager import create_unique_filename
@@ -11,17 +14,26 @@
WebCrawler,
)

def upload_to_cloud(item):
filename = item["url"].replace("https://", "").replace("http://", "").replace("/", "-")+".json"
filename = os.path.join(DATA_DIR_PATH, filename)
upload_file(io.BytesIO(json.dumps(item, indent=4).encode("utf-8")), filename)
logger.info(f"Content from {item['url']} saved to {filename}")

if __name__ == "__main__":
start_url = "https://www.ucsc.edu/"
start_url = "https://admissions.ucsc.edu/"
base_url = "ucsc.edu"

max_depth = 10

crawler = WebCrawler(SessionManager, LinkResolver, ContentExtractor)
crawled_data = crawler.crawl(start_url, max_depth)
crawled_data = crawler.crawl(start_url, base_url, max_depth)
num_crawled = 0

for item in crawled_data:
filename = create_unique_filename(item["url"], DATA_DIR_PATH)
upload_file(io.BytesIO(item["text"].encode("utf-8")), filename)
logger.info(f"Content from {item['url']} saved to {filename}")

logger.info(f"Total pages crawled: {len(crawled_data)}")
thread = Thread(target=upload_to_cloud, args=(item,))
thread.start()
num_crawled += 1
logger.info(f"Total pages crawled: {num_crawled}")
logger.info("All crawled data has been saved to individual files.")
2 changes: 1 addition & 1 deletion backend/src/Web/GoogleCloudStorage.py
@@ -38,7 +38,7 @@ def upload_file(file_stream, filename):
return f"File '{filename}' uploaded successfully."
except Exception as e:
logger.error(f"Failed to upload file '{filename}': {str(e)}")
raise Exception(f"Failed to upload file: {str(e)}")
# raise Exception(f"Failed to upload file: {str(e)}")


def download_file(filename):
47 changes: 38 additions & 9 deletions backend/src/Web/WebCrawler.py
@@ -4,6 +4,9 @@
from bs4 import BeautifulSoup
from src.Logging.Logging import logger
from src.Web.SSLAdapter import SSLAdapter
import markdownify
import re
import tldextract


class SessionManager:
@@ -30,16 +33,36 @@ class LinkResolver:
"""Handles URL resolution and filtering of links on a page."""

@staticmethod
def resolve_links(base_url, soup, visited):
def resolve_links(url, soup, visited):
links = []
for link in soup.find_all("a", href=True):
href = link["href"]
if not href.startswith("http"):
href = requests.compat.urljoin(base_url, href)
href = requests.compat.urljoin(url, href)
if href not in visited:
links.append(href)
return links

@staticmethod
def resolve_links(url, base_url, soup, visited):
links = []
for link in soup.find_all("a", href=True):
href = link["href"]
if not href.startswith("http"):
href = requests.compat.urljoin(url, href)
if href not in visited and LinkResolver.same_domain(href, base_url) and not "#" in href:
links.append(href)
return links

@staticmethod
def same_domain(url, base_url):
sub_parts = tldextract.extract(url)
parent_parts = tldextract.extract(base_url)

# Compare the root domain and suffix
return (sub_parts.domain == parent_parts.domain and
sub_parts.suffix == parent_parts.suffix)


class ContentExtractor:
"""Extracts and cleans text content from a given HTML page."""
@@ -55,7 +78,14 @@ def extract_main_text(soup):
return main_content.get_text(separator="\n", strip=True)
else:
return "No <main> content found."

@staticmethod
def convert_to_md(soup):
html = soup.find("main") or soup.find("body") or soup
text = markdownify.markdownify(str(html), strip=["a", "img"])
text = re.sub(r"\n\n+", "\n", text).strip()

return text

class WebCrawler:
"""Crawls a website up to a given depth and returns page content."""
@@ -70,10 +100,9 @@ def __init__(
self.link_resolver = link_resolver
self.content_extractor = content_extractor

def crawl(self, start_url, max_depth):
def crawl(self, start_url, base_url, max_depth):
visited = set()
queue = deque([(start_url, 0)])
data = []

while queue:
url, depth = queue.popleft()
@@ -88,13 +117,13 @@ def crawl(self, start_url, base_url, max_depth):
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

text = self.content_extractor.extract_main_text(soup)
data.append({"url": url, "depth": depth, "text": text})
new_links = self.link_resolver.resolve_links(url, soup, visited)
text = self.content_extractor.convert_to_md(soup)
data = {"url": url, "title": soup.find('title').text, "depth": depth, "text": text}
new_links = self.link_resolver.resolve_links(url, base_url, soup, visited)
for link in new_links:
queue.append((link, depth + 1))

yield data

except Exception as e:
logger.error(f"Error crawling {url}: {str(e)}")

return data
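Note (illustration, not part of the commit): the new same-domain filter compares only the registrable domain and public suffix via tldextract, so any ucsc.edu subdomain is crawled while external links are dropped. A minimal standalone check, assuming tldextract is installed:

import tldextract

def same_domain(url, base_url):
    a, b = tldextract.extract(url), tldextract.extract(base_url)
    # Compare registrable domain and suffix only; subdomains are ignored.
    return a.domain == b.domain and a.suffix == b.suffix

print(same_domain("https://admissions.ucsc.edu/apply/", "ucsc.edu"))  # True
print(same_domain("https://www.google.com/", "ucsc.edu"))             # False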
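Note (illustration, not part of the commit): convert_to_md() prefers <main>, falls back to <body>, strips link and image markup with markdownify, and collapses blank lines. A small self-contained run on made-up HTML, assuming beautifulsoup4 and markdownify are installed (exact markdown output can vary slightly across markdownify versions):

import re
from bs4 import BeautifulSoup
import markdownify

html = """
<html><body>
  <main>
    <h1>Apply to UCSC</h1>
    <p>Deadlines and <a href="/faq">FAQ</a> below.</p>
    <img src="banner.png" alt="banner">
    <p>First-year applicants</p>
  </main>
</body></html>
"""

soup = BeautifulSoup(html, "html.parser")
main = soup.find("main") or soup.find("body") or soup
# strip=["a", "img"] removes link/image markup but keeps the link text;
# the regex then collapses runs of blank lines, as in convert_to_md().
text = markdownify.markdownify(str(main), strip=["a", "img"])
text = re.sub(r"\n\n+", "\n", text).strip()
print(text)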
3 changes: 2 additions & 1 deletion backend/src/config.py
@@ -10,5 +10,6 @@

DataFileName = f"{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.data"
DATA_FILE_PATH = f".data/{FolderYear}/{FolderDate}/{DataFileName}"
DATA_DIR_PATH = f"/{FolderDate}"
# DATA_DIR_PATH = f"{FolderDate}/"
DATA_DIR_PATH = "data/"
os.makedirs(os.path.dirname(DATA_DIR_PATH), exist_ok=True)
