From b50228848c5d2c3b13f81fa7a472f117c76db465 Mon Sep 17 00:00:00 2001 From: Abolfazl Andalib <79583121+abolfazl8131@users.noreply.github.com> Date: Wed, 11 Dec 2024 17:10:01 +0330 Subject: [PATCH] fix(crawler): Re-design a crowler (#141) * feat(compose): comelete compose prompt * fix(kuber): remove lb * feat(compose): compelete compose prompt * nothing * fix(compose): totally restructre docker compose generator * fix(compose): directory builder * fix(compose): compelete compose generation allgorithm * fix(compose): edit default values for documentation * feat(compose): add union type input for networks * fix(routes): add /api to all routes * fix(installation): fix terraform installation process and model * fix(installation): create MyBash for scripts * fix(bash): edit bi/bash * fix(docker install): fix it * feat(install): add jenkins and gitlab installation * Update unit-test.yml * fix(install): fix jenkins and gitlab * fix(crawler): fix the crowler to crawl 2 aws urls --- admin-panel | 2 +- crawl/content_parser.py | 105 ------------ ...Amazon EC2 instance types - Amazon EC2.txt | 79 +++++++++ ...on EC2? - Amazon Elastic Compute Cloud.txt | 151 ++++++++++++++++++ crawl/main.py | 131 +++++---------- crawl/readme.md | 39 ----- crawl/urls.csv | 1 - 7 files changed, 272 insertions(+), 236 deletions(-) delete mode 100644 crawl/content_parser.py create mode 100644 crawl/crawled_data/Amazon EC2 instance types - Amazon EC2.txt create mode 100644 crawl/crawled_data/What is Amazon EC2? - Amazon Elastic Compute Cloud.txt delete mode 100644 crawl/readme.md delete mode 100644 crawl/urls.csv diff --git a/admin-panel b/admin-panel index 5b9c0c12..bfa06012 160000 --- a/admin-panel +++ b/admin-panel @@ -1 +1 @@ -Subproject commit 5b9c0c123018e42b185681bb955c7a8b48b6b7f8 +Subproject commit bfa06012cc943bdb1a59fde5fe235be06840005d diff --git a/crawl/content_parser.py b/crawl/content_parser.py deleted file mode 100644 index 9e03e97c..00000000 --- a/crawl/content_parser.py +++ /dev/null @@ -1,105 +0,0 @@ -import requests -from bs4 import BeautifulSoup -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry - -class WebContentParser: - def __init__(self, url): - self.url = url - self.headers = { - 'User-Agent': ( - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) ' - 'AppleWebKit/537.36 (KHTML, like Gecko) ' - 'Chrome/50.0.2661.102 Safari/537.36' - ) - } - self.session = self._initialize_session() - self.main_response = None - self.all_page_data = [] - - def _initialize_session(self): - """Set up the session with retry strategy.""" - retry_strategy = Retry( - total=5, - backoff_factor=8, - ) - adapter = HTTPAdapter(max_retries=retry_strategy) - adapter.max_retries.respect_retry_after_header = False - - session = requests.Session() - session.mount("https://", adapter) - session.mount("http://", adapter) - return session - - def fetch_content(self): - """Fetch the main content from the URL.""" - try: - self.main_response = self.session.get( - self.url, verify=False, timeout=30, headers=self.headers - ) - print(f'URL fetched: {self.url}') - return self.main_response - except requests.RequestException as e: - print(f"Failed to fetch the URL: {e}") - return None - - def parse_content(self): - """Parse the fetched HTML content.""" - if not self.main_response: - print("No response available to parse.") - return [] - - main_soup = BeautifulSoup(self.main_response.content, 'html.parser') - datas = main_soup.find('main', {'id': 'main'}) - if not datas: - print("No 'main' 
element found.") - return [] - - all_tag = datas.find_all(['h1', 'h2', 'h3', 'p', 'blockquote', 'ul']) - each_title_data = {} - - for tag in all_tag: - if tag.name in ['h1', 'h2']: - if each_title_data: - self.all_page_data.append(each_title_data) - each_title_data = {} - each_title_data['metadata'] = tag.text.strip() - - elif tag.name == 'h3': - if tag.text.strip() == 'Resources': - each_title_data[tag.text.strip()] = '' - else: - if each_title_data: - self.all_page_data.append(each_title_data) - each_title_data = {} - each_title_data['metadata'] = tag.text.strip() - - elif tag.name in ['p', 'blockquote']: - num = len(each_title_data) - key = f'content {num}' - if tag.text.strip(): - each_title_data[key] = tag.text.strip() - - elif tag.name == 'ul': - text = ' '.join( - li.text.strip() - for li in tag.find_all('li', {'class': 'mdx-lists_listItem__nkqhg'}) - ) - if 'Resources' in each_title_data: - each_title_data['Resources'] = text - else: - num = len(each_title_data) - key = f'content {num}' - if text: - each_title_data[key] = text - - if each_title_data: - self.all_page_data.append(each_title_data) - - return self.all_page_data - - def get_data(self): - """Main method to fetch and parse content.""" - self.fetch_content() - return self.parse_content() - diff --git a/crawl/crawled_data/Amazon EC2 instance types - Amazon EC2.txt b/crawl/crawled_data/Amazon EC2 instance types - Amazon EC2.txt new file mode 100644 index 00000000..861c4cda --- /dev/null +++ b/crawl/crawled_data/Amazon EC2 instance types - Amazon EC2.txt @@ -0,0 +1,79 @@ +Title: Amazon EC2 instance types - Amazon EC2 + +When you launch an EC2 instance, the instance type that you specify + determines the hardware of the host computer used for your instance. Each instance type + offers different compute, memory, and storage capabilities, and is grouped in an instance + family based on these capabilities. Select an instance type based on the requirements of the + application or software that you plan to run on your instance. +Amazon EC2 dedicates some resources of the host computer, such as CPU, memory, and instance + storage, to a particular instance. Amazon EC2 shares other resources of the host computer, such + as the network and the disk subsystem, among instances. If each instance on a host computer + tries to use as much of one of these shared resources as possible, each receives an equal + share of that resource. However, when a resource is underused, an instance can consume a + higher share of that resource while it's available. +Each instance type provides higher or lower minimum performance from a shared resource. + For example, instance types with high I/O performance have a larger allocation of shared resources. + Allocating a larger share of shared resources also reduces the variance of I/O performance. + For most applications, moderate I/O performance is more than enough. However, for + applications that require greater or more consistent I/O performance, consider + an instance type with higher I/O performance. +Current generation instances +Previous generation instances +Amazon EC2 instance type naming conventions +Amazon EC2 instance type specifications +Instances built on the AWS Nitro System +Amazon EC2 instance type quotas +For the best performance, we recommend that you use the following instance types + when you launch new instances. For more information, see Amazon EC2 Instance Types. 
+General purpose: M5 | M5a | M5ad | M5d | M5dn | M5n | M5zn | M6a | M6g | M6gd | M6i | M6id | M6idn | M6in | M7a | M7g | M7gd | M7i | M7i-flex | M8g | Mac1 | Mac2 | Mac2-m1ultra | Mac2-m2 | Mac2-m2pro | T2 | T3 | T3a | T4g +Compute optimized: C5 | C5a | C5ad | C5d | C5n | C6a | C6g | C6gd | C6gn | C6i | C6id | C6in | C7a | C7g | C7gd | C7gn | C7i | C7i-flex | C8g +Memory optimized: R5 | R5a | R5ad | R5b | R5d | R5dn | R5n | R6a | R6g | R6gd | R6i | R6idn | R6in | R6id | R7a | R7g | R7gd | R7i | R7iz | R8g | U-3tb1 | U-6tb1 | U-9tb1 | U-12tb1 | U-18tb1 | U-24tb1 | U7i-6tb | U7i-8tb | U7i-12tb | U7in-16tb | U7in-24tb | U7in-32tb | X1 | X1e | X2gd | X2idn | X2iedn | X2iezn | X8g | z1d +Storage optimized: D2 | D3 | D3en | H1 | I3 | I3en | I4g | I4i | I7ie | I8g | Im4gn | Is4gen +Accelerated computing: DL1 | DL2q | F1 | G4ad | G4dn | G5 | G5g | G6 | G6e | Gr6 | Inf1 | Inf2 | P2 | P3 | P3dn | P4d | P4de | P5 | P5e | P5en | Trn1 | Trn1n | Trn2 | Trn2u | VT1 +High-performance computing: Hpc6a | Hpc6id | Hpc7a | Hpc7g +Amazon Web Services offers previous generation instance types for users who have optimized their + applications around them and have yet to upgrade. We encourage you to use current generation + instance types to get the best performance, but we continue to support the following previous + generation instance types. For more information about which current + generation instance type would be a suitable upgrade, see + Previous Generation Instances. +General purpose: A1 | M1 | M2 | M3 | M4 | T1 +Compute optimized: C1 | C3 | C4 +Memory optimized: R3 | R4 +Storage optimized: I2 +Accelerated computing: G3 +Fixed performance instances provide fixed CPU resources. These instances can + deliver and sustain full CPU performance at any time, and for as long as a workload + needs it. If you need consistently high CPU performance for applications such as + video encoding, high volume websites, or HPC applications, we recommend that you use + fixed performance instances. +Burstable performance (T) instances provide a baseline level of CPU + performance with the ability to burst above the baseline. The baseline CPU is + designed to meet the needs of the majority of general purpose workloads, such as + large-scale micro-services, web servers, small and medium databases, data logging, + code repositories, virtual desktops, and development and test environments. +The baseline utilization and ability to burst are governed by CPU credits. Each + burstable performance instance continuously earns credits when it stays below the CPU + baseline, and continuously spends credits when it bursts above the baseline. For more + information, see Burstable + performance instances in the Amazon EC2 User Guide. +M7i-flex and C7i-flex instances offer a balance of compute, memory, and network + resources, and they provide the most cost-effective way to run a broad spectrum of + general purpose applications. These instances provide reliable CPU resources to + deliver a baseline CPU performance of 40 percent, which is designed to meet the + compute requirements for a majority of general purpose workloads. When more + performance is needed, these instances provide the ability to exceed the baseline + CPU performance and deliver up to 100 percent CPU performance for 95 percent of the + time over a 24-hour window. +M7i-flex and C7i-flex instances running at a high CPU utilization that is consistently + above the baseline for long periods of time might see a gradual reduction in the maximum + burst CPU throughput. 
For more information, see M7i-flex instances and C7i-flex instances. +For pricing information, see Amazon EC2 Pricing. + Javascript is disabled or is unavailable in your browser. +To use the Amazon Web Services Documentation, Javascript must be enabled. Please refer to your browser's Help pages for instructions. +Thanks for letting us know we're doing a good job! +If you've got a moment, please tell us what we did right so we can do more of it. + +Thanks for letting us know this page needs work. We're sorry we let you down. +If you've got a moment, please tell us how we can make the documentation better. + diff --git a/crawl/crawled_data/What is Amazon EC2? - Amazon Elastic Compute Cloud.txt b/crawl/crawled_data/What is Amazon EC2? - Amazon Elastic Compute Cloud.txt new file mode 100644 index 00000000..d0e78fd3 --- /dev/null +++ b/crawl/crawled_data/What is Amazon EC2? - Amazon Elastic Compute Cloud.txt @@ -0,0 +1,151 @@ +Title: What is Amazon EC2? - Amazon Elastic Compute Cloud + +Amazon Elastic Compute Cloud (Amazon EC2) provides on-demand, scalable computing capacity in the Amazon Web + Services (AWS) Cloud. Using Amazon EC2 reduces hardware costs so you can develop and deploy + applications faster. You can use Amazon EC2 to launch as many or as few virtual servers as you + need, configure security and networking, and manage storage. You can add capacity (scale up) + to handle compute-heavy tasks, such as monthly or yearly processes, or spikes in website + traffic. When usage decreases, you can reduce capacity (scale down) again. +An EC2 instance is a virtual server in the AWS Cloud. When you launch an EC2 instance, + the instance type that you specify determines the hardware available to your instance. + Each instance type offers a different balance of compute, memory, network, and storage + resources. For more information, see the Amazon EC2 Instance Types Guide. +Amazon EC2 provides the following high-level features: +Virtual servers. +Preconfigured templates for your instances that package the components you + need for your server (including the operating system and additional + software). +Various configurations of CPU, memory, storage, networking capacity, and + graphics hardware for your instances. +Persistent storage volumes for your data using Amazon Elastic Block Store (Amazon EBS). +Storage volumes for temporary data that is deleted when you stop, + hibernate, or terminate your instance. +Secure login information for your instances. AWS stores the public key + and you store the private key in a secure place. +A virtual firewall that allows you to specify the protocols, ports, and + source IP ranges that can reach your instances, and the destination IP + ranges to which your instances can connect. +Amazon EC2 supports the processing, storage, and transmission +of credit card data by a merchant or service provider, and has been +validated as being compliant with Payment Card Industry (PCI) Data Security Standard (DSS). +For more information about PCI DSS, including how to request a copy of the AWS PCI Compliance Package, +see PCI DSS Level 1. + +You can use other AWS services with the instances that you deploy using Amazon EC2. +Helps ensure you have the correct number of Amazon EC2 instances available to + handle the load for your application. +Automate backing up your Amazon EC2 instances and the Amazon EBS volumes attached to + them. +Monitor your instances and Amazon EBS volumes. +Automatically distribute incoming application traffic across multiple + instances. 
+Detect potentially unauthorized or malicious use of your EC2 instances. +Automate the creation, management, and deployment of customized, secure, and + up-to-date server images. +Size, configure, and deploy AWS resources for third-party applications + without having to manually identify and provision individual AWS + resources. +Perform operations at scale on EC2 instances with this secure end-to-end + management solution. +You can launch instances using another AWS compute service instead of using Amazon EC2. +Build websites or web applications using Amazon Lightsail, a cloud platform + that provides the resources that you need to deploy your project quickly, for + a low, predictable monthly price. To compare Amazon EC2 and Lightsail, see + Amazon Lightsail or Amazon EC2. +Deploy, manage, and scale containerized applications on a cluster of EC2 + instances. For more information, see Choosing an AWS container service. +Run your Kubernetes applications on AWS. For more information, see + Choosing an AWS container service. +You can create and manage your Amazon EC2 instances using the following interfaces: +A simple web interface to create and manage Amazon EC2 instances and resources. + If you've signed up for an AWS account, you can access the Amazon EC2 console + by signing into the AWS Management Console and selecting EC2 from + the console home page. +Enables you to interact with AWS services using commands in your command-line shell. It + is supported on Windows, Mac, and Linux. For more information about the + AWS CLI , see AWS Command Line Interface User Guide. You can find the Amazon EC2 commands in the AWS CLI Command Reference. +Amazon EC2 supports creating resources using AWS CloudFormation. You create a template, in JSON or YAML + format, that describes your AWS resources, and AWS CloudFormation provisions and + configures those resources for you. You can reuse your CloudFormation + templates to provision the same resources multiple times, whether in the + same Region and account or in multiple Regions and accounts. For more + information about supported resource types and properties for Amazon EC2, see + EC2 resource type + reference in the AWS CloudFormation User Guide. +If you prefer to build applications using language-specific APIs instead + of submitting a request over HTTP or HTTPS, AWS provides libraries, sample + code, tutorials, and other resources for software developers. These + libraries provide basic functions that automate tasks such as + cryptographically signing your requests, retrying requests, and handling + error responses, making it easier for you to get started. For more + information, see + Tools to Build + on AWS. +A set of PowerShell modules that are built on the functionality exposed by + the AWS SDK for .NET. The Tools for PowerShell enable you to script operations on your AWS + resources from the PowerShell command line. To get started, see the + AWS Tools for Windows PowerShell User Guide. You can find the cmdlets for Amazon EC2, in the AWS Tools for PowerShell Cmdlet Reference. +Amazon EC2 provides a Query API. These requests are HTTP or HTTPS requests that + use the HTTP verbs GET or POST and a Query parameter named + Action. For more information about the API actions for + Amazon EC2, see Actions in the + Amazon EC2 API Reference. +Amazon EC2 provides the following pricing options: +You can get started with Amazon EC2 for free. To explore the Free Tier options, + see AWS Free Tier. 
+Pay for the instances that you use by the second, with a minimum of 60 + seconds, with no long-term commitments or upfront payments. +You can reduce your Amazon EC2 costs by making a commitment to a consistent + amount of usage, in USD per hour, for a term of 1 or 3 years. +You can reduce your Amazon EC2 costs by making a commitment to a specific + instance configuration, including instance type and Region, for a term of 1 + or 3 years. +Request unused EC2 instances, which can reduce your Amazon EC2 costs + significantly. +Reduce costs by using a physical EC2 server that is fully dedicated for + your use, either On-Demand or as part of a Savings Plan. You can use your + existing server-bound software licenses and get help meeting compliance + requirements. +Reserve compute capacity for your EC2 instances in a specific Availability + Zone for any duration of time. +Removes the cost of unused minutes and seconds from your bill. +For a complete list of charges and prices for Amazon EC2 and more information about the purchase + models, see Amazon EC2 pricing. +To create estimates for your AWS use cases, use the AWS Pricing Calculator. +To estimate the cost of transforming Microsoft + workloads to a modern architecture that uses open source and + cloud-native services deployed on AWS, use the AWS + Modernization Calculator for Microsoft Workloads. +To see your bill, go to the Billing and Cost Management + Dashboard in the AWS Billing and Cost Management + console. Your bill contains links to usage reports that provide details + about your bill. To learn more about AWS account billing, see AWS Billing and Cost Management User + Guide. +If you have questions concerning AWS billing, accounts, and events, contact AWS Support. +To calculate the cost of a sample provisioned + environment, see Cloud Economics + Center. When calculating the cost of a provisioned + environment, remember to include incidental costs such as snapshot storage for EBS + volumes. +You can optimize the cost, security, and performance of your AWS environment + using AWS Trusted Advisor. +You can use AWS Cost Explorer to analyze the cost and usage of your EC2 instances. You can view + data up to the last 13 months, and forecast how much you are likely to spend for the next + 12 months. For more information, see + Analyzing your costs with + AWS Cost Explorer in the AWS Cost Management User Guide. +Amazon EC2 features +AWS re:Post +AWS Skill Builder +AWS Support +Hands-on Tutorials +Web Hosting +Windows on AWS + Javascript is disabled or is unavailable in your browser. +To use the Amazon Web Services Documentation, Javascript must be enabled. Please refer to your browser's Help pages for instructions. +Thanks for letting us know we're doing a good job! +If you've got a moment, please tell us what we did right so we can do more of it. + +Thanks for letting us know this page needs work. We're sorry we let you down. +If you've got a moment, please tell us how we can make the documentation better. 
+ diff --git a/crawl/main.py b/crawl/main.py index f86e632e..3a4621e3 100644 --- a/crawl/main.py +++ b/crawl/main.py @@ -1,92 +1,43 @@ - -import argparse -import csv -import logging import requests from bs4 import BeautifulSoup -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry -from content_parser import WebContentParser - - -def setup_logging(): - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s', - handlers=[logging.StreamHandler()] - ) - - -def setup_http_session(): - retry_strategy = Retry( - total=5, - backoff_factor=8, - ) - adapter = HTTPAdapter(max_retries=retry_strategy) - adapter.max_retries.respect_retry_after_header = False - session = requests.Session() - session.mount("https://", adapter) - session.mount("http://", adapter) - return session - - -def process_urls(file_path, save_result): - http = setup_http_session() - headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'} - - with open(file_path, 'r') as file: - csv_reader = csv.reader(file) - for row in csv_reader: - if row: # Check if the row is not empty - main_url = row[0] - try: - main_response = http.get(main_url, verify=False, timeout=30, headers=headers) - logging.info(f'Fetched URL: {main_url}') - except requests.RequestException as e: - logging.error(f"Failed to fetch URL {main_url}: {e}") - continue - - main_soup = BeautifulSoup(main_response.content, 'html.parser') - products = main_soup.find('div', {'class': 'marketing-content_root__DE3hU'}).find_all('div', {'class': 'card-grid-block_root__yDdm_'}) - logging.info(f'Found {len(products)} products on page: {main_url}') - all_data = [] - for product in products: - # Get org title - title = product.find('h2').text - sub_content_link=[] - all_sub_title = product.find_all('li') - for res in all_sub_title: - sub_part_content = {} - sub_part_content['main_title'] = title - sub_title = res.find('span', {'class': 'card-title_text__F97Wj'}).get_text() - sub_part_content['sub_title'] = sub_title - sub_title_link = 'https://developer.hashicorp.com' + res.find('a').attrs['href'] - sub_part_content['sub_title_link'] = sub_title_link - - parser = WebContentParser(sub_title_link) - data = parser.get_data() - sub_part_content['all_data_info'] = data - - logging.info(f'Parsed content for sub-title: {sub_title}') - sub_content_link.append(sub_part_content) - all_data.append(sub_content_link) - if save_result: - # Logic to save sub_part_content goes here (e.g., writing to a file or database) - logging.info(f'Saving result for: {all_data}') - else: - print(all_data) - - -def main(): - setup_logging() - - parser = argparse.ArgumentParser(description='Process URLs from a CSV file.') - parser.add_argument('--csv_path', type=str, default='./urls.csv', help='Path to the CSV file containing URLs') - parser.add_argument('--save_result', type=bool, default=False, help='Flag to indicate if the results should be saved') - args = parser.parse_args() - - process_urls(args.csv_path, args.save_result) - - -if __name__ == '__main__': - main() +import os + +# List of URLs to crawl +urls = [ + "https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/concepts.html", + "https://docs.aws.amazon.com/ec2/latest/instancetypes/instance-types.html#current-gen-instances" +] + +# Directory to save the files +save_dir = "crawled_data" +os.makedirs(save_dir, exist_ok=True) + +def fetch_and_save(url): + try: + response = 
requests.get(url) + response.raise_for_status() # Check if the request was successful + + # Parse the HTML content + soup = BeautifulSoup(response.text, 'html.parser') + + # For demonstration, we are fetching the page title and all paragraphs + title = soup.title.string if soup.title else "no_title" + paragraphs = soup.find_all('p') + + # Prepare the file name + file_name = os.path.join(save_dir, f"{title}.txt") + + # Write the content to the file + with open(file_name, 'w', encoding='utf-8') as file: + file.write(f"Title: {title}\n\n") + for para in paragraphs: + file.write(para.get_text() + "\n") + + print(f"Saved content from {url} to {file_name}") + + except requests.RequestException as e: + print(f"Failed to fetch {url}: {e}") + +# Fetch and save data from each URL +for url in urls: + fetch_and_save(url) diff --git a/crawl/readme.md b/crawl/readme.md deleted file mode 100644 index 93e44d57..00000000 --- a/crawl/readme.md +++ /dev/null @@ -1,39 +0,0 @@ -# Documentation for Web Content Scraper - -## Overview -This script is designed to scrape data from a list of URLs provided in a CSV file. It fetches the content, extracts specific product information, and logs the operations performed. Optionally, the extracted content can also be saved. The script utilizes various libraries such as `requests`, `BeautifulSoup`, and `argparse` to ensure efficient and robust operation. - -## Prerequisites -Make sure the following Python packages are installed: -- `requests` -- `beautifulsoup4` -- `urllib3` - -To install the dependencies, run the following command: -```sh -pip install requests beautifulsoup4 -``` -## How to Use -Arguments -The script accepts command-line arguments that allow customization of behavior: ---csv_path: The path to the CSV file containing URLs to scrape. The default value is ./urls.csv. ---save_result: A boolean flag indicating whether to save the scraped results. The default value is False. -## Running the Script -You can run the script by using the following command: - -```sh -Copy code -python main.py --csv_path --save_result -``` -For example: -```sh -Copy code -python main.py --csv_path ./urls.csv --save_result True -``` -## CSV File Format -The CSV file should contain a list of URLs, with each URL on a new line. Here is an example: -``` -https://example.com/page1 -https://example.com/page2 -``` - diff --git a/crawl/urls.csv b/crawl/urls.csv deleted file mode 100644 index 46e1afd1..00000000 --- a/crawl/urls.csv +++ /dev/null @@ -1 +0,0 @@ -https://developer.hashicorp.com/terraform/docs \ No newline at end of file
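
Note on the new crawler (a sketch, not part of the patch): the rewritten `crawl/main.py` drops the retry/back-off session that the removed `content_parser.py` configured. If transient fetch failures against the AWS documentation pages become a problem, the same strategy could be reattached to the new `fetch_and_save` flow roughly as below; the helper name `build_session` is hypothetical, and the values (`total=5`, `backoff_factor=8`, `timeout=30`) are simply carried over from the deleted module rather than being a definitive implementation.

```python
# Sketch only (assumes requests, urllib3, and beautifulsoup4 are installed);
# build_session() is a hypothetical helper, not part of this patch.
import os

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def build_session() -> requests.Session:
    """Return a Session that retries failed requests, mirroring the removed parser."""
    retry_strategy = Retry(total=5, backoff_factor=8)
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session = requests.Session()
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session


def fetch_and_save(session: requests.Session, url: str, save_dir: str = "crawled_data") -> None:
    """Fetch one documentation page and save its title and paragraphs to a text file."""
    os.makedirs(save_dir, exist_ok=True)
    response = session.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.title.string.strip() if soup.title and soup.title.string else "no_title"
    paragraphs = soup.find_all("p")

    file_name = os.path.join(save_dir, f"{title}.txt")
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(f"Title: {title}\n\n")
        for para in paragraphs:
            file.write(para.get_text() + "\n")


if __name__ == "__main__":
    session = build_session()
    for url in [
        "https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/concepts.html",
        "https://docs.aws.amazon.com/ec2/latest/instancetypes/instance-types.html#current-gen-instances",
    ]:
        try:
            fetch_and_save(session, url)
        except requests.RequestException as exc:
            print(f"Failed to fetch {url}: {exc}")
```

As in the committed version, the page title is used verbatim as the output file name, which is how the two `crawled_data/*.txt` files added above got their names.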