From b50228848c5d2c3b13f81fa7a472f117c76db465 Mon Sep 17 00:00:00 2001 From: Abolfazl Andalib <79583121+abolfazl8131@users.noreply.github.com> Date: Wed, 11 Dec 2024 17:10:01 +0330 Subject: [PATCH] fix(crawler): Re-design a crowler (#141) * feat(compose): comelete compose prompt * fix(kuber): remove lb * feat(compose): compelete compose prompt * nothing * fix(compose): totally restructre docker compose generator * fix(compose): directory builder * fix(compose): compelete compose generation allgorithm * fix(compose): edit default values for documentation * feat(compose): add union type input for networks * fix(routes): add /api to all routes * fix(installation): fix terraform installation process and model * fix(installation): create MyBash for scripts * fix(bash): edit bi/bash * fix(docker install): fix it * feat(install): add jenkins and gitlab installation * Update unit-test.yml * fix(install): fix jenkins and gitlab * fix(crawler): fix the crowler to crawl 2 aws urls --- admin-panel | 2 +- crawl/content_parser.py | 105 ------------ ...Amazon EC2 instance types - Amazon EC2.txt | 79 +++++++++ ...on EC2? - Amazon Elastic Compute Cloud.txt | 151 ++++++++++++++++++ crawl/main.py | 131 +++++---------- crawl/readme.md | 39 ----- crawl/urls.csv | 1 - 7 files changed, 272 insertions(+), 236 deletions(-) delete mode 100644 crawl/content_parser.py create mode 100644 crawl/crawled_data/Amazon EC2 instance types - Amazon EC2.txt create mode 100644 crawl/crawled_data/What is Amazon EC2? - Amazon Elastic Compute Cloud.txt delete mode 100644 crawl/readme.md delete mode 100644 crawl/urls.csv diff --git a/admin-panel b/admin-panel index 5b9c0c12..bfa06012 160000 --- a/admin-panel +++ b/admin-panel @@ -1 +1 @@ -Subproject commit 5b9c0c123018e42b185681bb955c7a8b48b6b7f8 +Subproject commit bfa06012cc943bdb1a59fde5fe235be06840005d diff --git a/crawl/content_parser.py b/crawl/content_parser.py deleted file mode 100644 index 9e03e97c..00000000 --- a/crawl/content_parser.py +++ /dev/null @@ -1,105 +0,0 @@ -import requests -from bs4 import BeautifulSoup -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry - -class WebContentParser: - def __init__(self, url): - self.url = url - self.headers = { - 'User-Agent': ( - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) ' - 'AppleWebKit/537.36 (KHTML, like Gecko) ' - 'Chrome/50.0.2661.102 Safari/537.36' - ) - } - self.session = self._initialize_session() - self.main_response = None - self.all_page_data = [] - - def _initialize_session(self): - """Set up the session with retry strategy.""" - retry_strategy = Retry( - total=5, - backoff_factor=8, - ) - adapter = HTTPAdapter(max_retries=retry_strategy) - adapter.max_retries.respect_retry_after_header = False - - session = requests.Session() - session.mount("https://", adapter) - session.mount("http://", adapter) - return session - - def fetch_content(self): - """Fetch the main content from the URL.""" - try: - self.main_response = self.session.get( - self.url, verify=False, timeout=30, headers=self.headers - ) - print(f'URL fetched: {self.url}') - return self.main_response - except requests.RequestException as e: - print(f"Failed to fetch the URL: {e}") - return None - - def parse_content(self): - """Parse the fetched HTML content.""" - if not self.main_response: - print("No response available to parse.") - return [] - - main_soup = BeautifulSoup(self.main_response.content, 'html.parser') - datas = main_soup.find('main', {'id': 'main'}) - if not datas: - print("No 'main' 
element found.") - return [] - - all_tag = datas.find_all(['h1', 'h2', 'h3', 'p', 'blockquote', 'ul']) - each_title_data = {} - - for tag in all_tag: - if tag.name in ['h1', 'h2']: - if each_title_data: - self.all_page_data.append(each_title_data) - each_title_data = {} - each_title_data['metadata'] = tag.text.strip() - - elif tag.name == 'h3': - if tag.text.strip() == 'Resources': - each_title_data[tag.text.strip()] = '' - else: - if each_title_data: - self.all_page_data.append(each_title_data) - each_title_data = {} - each_title_data['metadata'] = tag.text.strip() - - elif tag.name in ['p', 'blockquote']: - num = len(each_title_data) - key = f'content {num}' - if tag.text.strip(): - each_title_data[key] = tag.text.strip() - - elif tag.name == 'ul': - text = ' '.join( - li.text.strip() - for li in tag.find_all('li', {'class': 'mdx-lists_listItem__nkqhg'}) - ) - if 'Resources' in each_title_data: - each_title_data['Resources'] = text - else: - num = len(each_title_data) - key = f'content {num}' - if text: - each_title_data[key] = text - - if each_title_data: - self.all_page_data.append(each_title_data) - - return self.all_page_data - - def get_data(self): - """Main method to fetch and parse content.""" - self.fetch_content() - return self.parse_content() - diff --git a/crawl/crawled_data/Amazon EC2 instance types - Amazon EC2.txt b/crawl/crawled_data/Amazon EC2 instance types - Amazon EC2.txt new file mode 100644 index 00000000..861c4cda --- /dev/null +++ b/crawl/crawled_data/Amazon EC2 instance types - Amazon EC2.txt @@ -0,0 +1,79 @@ +Title: Amazon EC2 instance types - Amazon EC2 + +When you launch an EC2 instance, the instance type that you specify + determines the hardware of the host computer used for your instance. Each instance type + offers different compute, memory, and storage capabilities, and is grouped in an instance + family based on these capabilities. Select an instance type based on the requirements of the + application or software that you plan to run on your instance. +Amazon EC2 dedicates some resources of the host computer, such as CPU, memory, and instance + storage, to a particular instance. Amazon EC2 shares other resources of the host computer, such + as the network and the disk subsystem, among instances. If each instance on a host computer + tries to use as much of one of these shared resources as possible, each receives an equal + share of that resource. However, when a resource is underused, an instance can consume a + higher share of that resource while it's available. +Each instance type provides higher or lower minimum performance from a shared resource. + For example, instance types with high I/O performance have a larger allocation of shared resources. + Allocating a larger share of shared resources also reduces the variance of I/O performance. + For most applications, moderate I/O performance is more than enough. However, for + applications that require greater or more consistent I/O performance, consider + an instance type with higher I/O performance. +Current generation instances +Previous generation instances +Amazon EC2 instance type naming conventions +Amazon EC2 instance type specifications +Instances built on the AWS Nitro System +Amazon EC2 instance type quotas +For the best performance, we recommend that you use the following instance types + when you launch new instances. For more information, see Amazon EC2 Instance Types. 
+General purpose: M5 | M5a | M5ad | M5d | M5dn | M5n | M5zn | M6a | M6g | M6gd | M6i | M6id | M6idn | M6in | M7a | M7g | M7gd | M7i | M7i-flex | M8g | Mac1 | Mac2 | Mac2-m1ultra | Mac2-m2 | Mac2-m2pro | T2 | T3 | T3a | T4g +Compute optimized: C5 | C5a | C5ad | C5d | C5n | C6a | C6g | C6gd | C6gn | C6i | C6id | C6in | C7a | C7g | C7gd | C7gn | C7i | C7i-flex | C8g +Memory optimized: R5 | R5a | R5ad | R5b | R5d | R5dn | R5n | R6a | R6g | R6gd | R6i | R6idn | R6in | R6id | R7a | R7g | R7gd | R7i | R7iz | R8g | U-3tb1 | U-6tb1 | U-9tb1 | U-12tb1 | U-18tb1 | U-24tb1 | U7i-6tb | U7i-8tb | U7i-12tb | U7in-16tb | U7in-24tb | U7in-32tb | X1 | X1e | X2gd | X2idn | X2iedn | X2iezn | X8g | z1d +Storage optimized: D2 | D3 | D3en | H1 | I3 | I3en | I4g | I4i | I7ie | I8g | Im4gn | Is4gen +Accelerated computing: DL1 | DL2q | F1 | G4ad | G4dn | G5 | G5g | G6 | G6e | Gr6 | Inf1 | Inf2 | P2 | P3 | P3dn | P4d | P4de | P5 | P5e | P5en | Trn1 | Trn1n | Trn2 | Trn2u | VT1 +High-performance computing: Hpc6a | Hpc6id | Hpc7a | Hpc7g +Amazon Web Services offers previous generation instance types for users who have optimized their + applications around them and have yet to upgrade. We encourage you to use current generation + instance types to get the best performance, but we continue to support the following previous + generation instance types. For more information about which current + generation instance type would be a suitable upgrade, see + Previous Generation Instances. +General purpose: A1 | M1 | M2 | M3 | M4 | T1 +Compute optimized: C1 | C3 | C4 +Memory optimized: R3 | R4 +Storage optimized: I2 +Accelerated computing: G3 +Fixed performance instances provide fixed CPU resources. These instances can + deliver and sustain full CPU performance at any time, and for as long as a workload + needs it. If you need consistently high CPU performance for applications such as + video encoding, high volume websites, or HPC applications, we recommend that you use + fixed performance instances. +Burstable performance (T) instances provide a baseline level of CPU + performance with the ability to burst above the baseline. The baseline CPU is + designed to meet the needs of the majority of general purpose workloads, such as + large-scale micro-services, web servers, small and medium databases, data logging, + code repositories, virtual desktops, and development and test environments. +The baseline utilization and ability to burst are governed by CPU credits. Each + burstable performance instance continuously earns credits when it stays below the CPU + baseline, and continuously spends credits when it bursts above the baseline. For more + information, see Burstable + performance instances in the Amazon EC2 User Guide. +M7i-flex and C7i-flex instances offer a balance of compute, memory, and network + resources, and they provide the most cost-effective way to run a broad spectrum of + general purpose applications. These instances provide reliable CPU resources to + deliver a baseline CPU performance of 40 percent, which is designed to meet the + compute requirements for a majority of general purpose workloads. When more + performance is needed, these instances provide the ability to exceed the baseline + CPU performance and deliver up to 100 percent CPU performance for 95 percent of the + time over a 24-hour window. +M7i-flex and C7i-flex instances running at a high CPU utilization that is consistently + above the baseline for long periods of time might see a gradual reduction in the maximum + burst CPU throughput. 
For more information, see M7i-flex instances and C7i-flex instances. +For pricing information, see Amazon EC2 Pricing. + Javascript is disabled or is unavailable in your browser. +To use the Amazon Web Services Documentation, Javascript must be enabled. Please refer to your browser's Help pages for instructions. +Thanks for letting us know we're doing a good job! +If you've got a moment, please tell us what we did right so we can do more of it. + +Thanks for letting us know this page needs work. We're sorry we let you down. +If you've got a moment, please tell us how we can make the documentation better. + diff --git a/crawl/crawled_data/What is Amazon EC2? - Amazon Elastic Compute Cloud.txt b/crawl/crawled_data/What is Amazon EC2? - Amazon Elastic Compute Cloud.txt new file mode 100644 index 00000000..d0e78fd3 --- /dev/null +++ b/crawl/crawled_data/What is Amazon EC2? - Amazon Elastic Compute Cloud.txt @@ -0,0 +1,151 @@ +Title: What is Amazon EC2? - Amazon Elastic Compute Cloud + +Amazon Elastic Compute Cloud (Amazon EC2) provides on-demand, scalable computing capacity in the Amazon Web + Services (AWS) Cloud. Using Amazon EC2 reduces hardware costs so you can develop and deploy + applications faster. You can use Amazon EC2 to launch as many or as few virtual servers as you + need, configure security and networking, and manage storage. You can add capacity (scale up) + to handle compute-heavy tasks, such as monthly or yearly processes, or spikes in website + traffic. When usage decreases, you can reduce capacity (scale down) again. +An EC2 instance is a virtual server in the AWS Cloud. When you launch an EC2 instance, + the instance type that you specify determines the hardware available to your instance. + Each instance type offers a different balance of compute, memory, network, and storage + resources. For more information, see the Amazon EC2 Instance Types Guide. +Amazon EC2 provides the following high-level features: +Virtual servers. +Preconfigured templates for your instances that package the components you + need for your server (including the operating system and additional + software). +Various configurations of CPU, memory, storage, networking capacity, and + graphics hardware for your instances. +Persistent storage volumes for your data using Amazon Elastic Block Store (Amazon EBS). +Storage volumes for temporary data that is deleted when you stop, + hibernate, or terminate your instance. +Secure login information for your instances. AWS stores the public key + and you store the private key in a secure place. +A virtual firewall that allows you to specify the protocols, ports, and + source IP ranges that can reach your instances, and the destination IP + ranges to which your instances can connect. +Amazon EC2 supports the processing, storage, and transmission +of credit card data by a merchant or service provider, and has been +validated as being compliant with Payment Card Industry (PCI) Data Security Standard (DSS). +For more information about PCI DSS, including how to request a copy of the AWS PCI Compliance Package, +see PCI DSS Level 1. + +You can use other AWS services with the instances that you deploy using Amazon EC2. +Helps ensure you have the correct number of Amazon EC2 instances available to + handle the load for your application. +Automate backing up your Amazon EC2 instances and the Amazon EBS volumes attached to + them. +Monitor your instances and Amazon EBS volumes. +Automatically distribute incoming application traffic across multiple + instances. 
+Detect potentially unauthorized or malicious use of your EC2 instances. +Automate the creation, management, and deployment of customized, secure, and + up-to-date server images. +Size, configure, and deploy AWS resources for third-party applications + without having to manually identify and provision individual AWS + resources. +Perform operations at scale on EC2 instances with this secure end-to-end + management solution. +You can launch instances using another AWS compute service instead of using Amazon EC2. +Build websites or web applications using Amazon Lightsail, a cloud platform + that provides the resources that you need to deploy your project quickly, for + a low, predictable monthly price. To compare Amazon EC2 and Lightsail, see + Amazon Lightsail or Amazon EC2. +Deploy, manage, and scale containerized applications on a cluster of EC2 + instances. For more information, see Choosing an AWS container service. +Run your Kubernetes applications on AWS. For more information, see + Choosing an AWS container service. +You can create and manage your Amazon EC2 instances using the following interfaces: +A simple web interface to create and manage Amazon EC2 instances and resources. + If you've signed up for an AWS account, you can access the Amazon EC2 console + by signing into the AWS Management Console and selecting EC2 from + the console home page. +Enables you to interact with AWS services using commands in your command-line shell. It + is supported on Windows, Mac, and Linux. For more information about the + AWS CLI , see AWS Command Line Interface User Guide. You can find the Amazon EC2 commands in the AWS CLI Command Reference. +Amazon EC2 supports creating resources using AWS CloudFormation. You create a template, in JSON or YAML + format, that describes your AWS resources, and AWS CloudFormation provisions and + configures those resources for you. You can reuse your CloudFormation + templates to provision the same resources multiple times, whether in the + same Region and account or in multiple Regions and accounts. For more + information about supported resource types and properties for Amazon EC2, see + EC2 resource type + reference in the AWS CloudFormation User Guide. +If you prefer to build applications using language-specific APIs instead + of submitting a request over HTTP or HTTPS, AWS provides libraries, sample + code, tutorials, and other resources for software developers. These + libraries provide basic functions that automate tasks such as + cryptographically signing your requests, retrying requests, and handling + error responses, making it easier for you to get started. For more + information, see + Tools to Build + on AWS. +A set of PowerShell modules that are built on the functionality exposed by + the AWS SDK for .NET. The Tools for PowerShell enable you to script operations on your AWS + resources from the PowerShell command line. To get started, see the + AWS Tools for Windows PowerShell User Guide. You can find the cmdlets for Amazon EC2, in the AWS Tools for PowerShell Cmdlet Reference. +Amazon EC2 provides a Query API. These requests are HTTP or HTTPS requests that + use the HTTP verbs GET or POST and a Query parameter named + Action. For more information about the API actions for + Amazon EC2, see Actions in the + Amazon EC2 API Reference. +Amazon EC2 provides the following pricing options: +You can get started with Amazon EC2 for free. To explore the Free Tier options, + see AWS Free Tier. 
+Pay for the instances that you use by the second, with a minimum of 60 + seconds, with no long-term commitments or upfront payments. +You can reduce your Amazon EC2 costs by making a commitment to a consistent + amount of usage, in USD per hour, for a term of 1 or 3 years. +You can reduce your Amazon EC2 costs by making a commitment to a specific + instance configuration, including instance type and Region, for a term of 1 + or 3 years. +Request unused EC2 instances, which can reduce your Amazon EC2 costs + significantly. +Reduce costs by using a physical EC2 server that is fully dedicated for + your use, either On-Demand or as part of a Savings Plan. You can use your + existing server-bound software licenses and get help meeting compliance + requirements. +Reserve compute capacity for your EC2 instances in a specific Availability + Zone for any duration of time. +Removes the cost of unused minutes and seconds from your bill. +For a complete list of charges and prices for Amazon EC2 and more information about the purchase + models, see Amazon EC2 pricing. +To create estimates for your AWS use cases, use the AWS Pricing Calculator. +To estimate the cost of transforming Microsoft + workloads to a modern architecture that uses open source and + cloud-native services deployed on AWS, use the AWS + Modernization Calculator for Microsoft Workloads. +To see your bill, go to the Billing and Cost Management + Dashboard in the AWS Billing and Cost Management + console. Your bill contains links to usage reports that provide details + about your bill. To learn more about AWS account billing, see AWS Billing and Cost Management User + Guide. +If you have questions concerning AWS billing, accounts, and events, contact AWS Support. +To calculate the cost of a sample provisioned + environment, see Cloud Economics + Center. When calculating the cost of a provisioned + environment, remember to include incidental costs such as snapshot storage for EBS + volumes. +You can optimize the cost, security, and performance of your AWS environment + using AWS Trusted Advisor. +You can use AWS Cost Explorer to analyze the cost and usage of your EC2 instances. You can view + data up to the last 13 months, and forecast how much you are likely to spend for the next + 12 months. For more information, see + Analyzing your costs with + AWS Cost Explorer in the AWS Cost Management User Guide. +Amazon EC2 features +AWS re:Post +AWS Skill Builder +AWS Support +Hands-on Tutorials +Web Hosting +Windows on AWS + Javascript is disabled or is unavailable in your browser. +To use the Amazon Web Services Documentation, Javascript must be enabled. Please refer to your browser's Help pages for instructions. +Thanks for letting us know we're doing a good job! +If you've got a moment, please tell us what we did right so we can do more of it. + +Thanks for letting us know this page needs work. We're sorry we let you down. +If you've got a moment, please tell us how we can make the documentation better. 
+ diff --git a/crawl/main.py b/crawl/main.py index f86e632e..3a4621e3 100644 --- a/crawl/main.py +++ b/crawl/main.py @@ -1,92 +1,43 @@ - -import argparse -import csv -import logging import requests from bs4 import BeautifulSoup -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry -from content_parser import WebContentParser - - -def setup_logging(): - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s', - handlers=[logging.StreamHandler()] - ) - - -def setup_http_session(): - retry_strategy = Retry( - total=5, - backoff_factor=8, - ) - adapter = HTTPAdapter(max_retries=retry_strategy) - adapter.max_retries.respect_retry_after_header = False - session = requests.Session() - session.mount("https://", adapter) - session.mount("http://", adapter) - return session - - -def process_urls(file_path, save_result): - http = setup_http_session() - headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'} - - with open(file_path, 'r') as file: - csv_reader = csv.reader(file) - for row in csv_reader: - if row: # Check if the row is not empty - main_url = row[0] - try: - main_response = http.get(main_url, verify=False, timeout=30, headers=headers) - logging.info(f'Fetched URL: {main_url}') - except requests.RequestException as e: - logging.error(f"Failed to fetch URL {main_url}: {e}") - continue - - main_soup = BeautifulSoup(main_response.content, 'html.parser') - products = main_soup.find('div', {'class': 'marketing-content_root__DE3hU'}).find_all('div', {'class': 'card-grid-block_root__yDdm_'}) - logging.info(f'Found {len(products)} products on page: {main_url}') - all_data = [] - for product in products: - # Get org title - title = product.find('h2').text - sub_content_link=[] - all_sub_title = product.find_all('li') - for res in all_sub_title: - sub_part_content = {} - sub_part_content['main_title'] = title - sub_title = res.find('span', {'class': 'card-title_text__F97Wj'}).get_text() - sub_part_content['sub_title'] = sub_title - sub_title_link = 'https://developer.hashicorp.com' + res.find('a').attrs['href'] - sub_part_content['sub_title_link'] = sub_title_link - - parser = WebContentParser(sub_title_link) - data = parser.get_data() - sub_part_content['all_data_info'] = data - - logging.info(f'Parsed content for sub-title: {sub_title}') - sub_content_link.append(sub_part_content) - all_data.append(sub_content_link) - if save_result: - # Logic to save sub_part_content goes here (e.g., writing to a file or database) - logging.info(f'Saving result for: {all_data}') - else: - print(all_data) - - -def main(): - setup_logging() - - parser = argparse.ArgumentParser(description='Process URLs from a CSV file.') - parser.add_argument('--csv_path', type=str, default='./urls.csv', help='Path to the CSV file containing URLs') - parser.add_argument('--save_result', type=bool, default=False, help='Flag to indicate if the results should be saved') - args = parser.parse_args() - - process_urls(args.csv_path, args.save_result) - - -if __name__ == '__main__': - main() +import os + +# List of URLs to crawl +urls = [ + "https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/concepts.html", + "https://docs.aws.amazon.com/ec2/latest/instancetypes/instance-types.html#current-gen-instances" +] + +# Directory to save the files +save_dir = "crawled_data" +os.makedirs(save_dir, exist_ok=True) + +def fetch_and_save(url): + try: + response = 
requests.get(url) + response.raise_for_status() # Check if the request was successful + + # Parse the HTML content + soup = BeautifulSoup(response.text, 'html.parser') + + # For demonstration, we are fetching the page title and all paragraphs + title = soup.title.string if soup.title else "no_title" + paragraphs = soup.find_all('p') + + # Prepare the file name + file_name = os.path.join(save_dir, f"{title}.txt") + + # Write the content to the file + with open(file_name, 'w', encoding='utf-8') as file: + file.write(f"Title: {title}\n\n") + for para in paragraphs: + file.write(para.get_text() + "\n") + + print(f"Saved content from {url} to {file_name}") + + except requests.RequestException as e: + print(f"Failed to fetch {url}: {e}") + +# Fetch and save data from each URL +for url in urls: + fetch_and_save(url) diff --git a/crawl/readme.md b/crawl/readme.md deleted file mode 100644 index 93e44d57..00000000 --- a/crawl/readme.md +++ /dev/null @@ -1,39 +0,0 @@ -# Documentation for Web Content Scraper - -## Overview -This script is designed to scrape data from a list of URLs provided in a CSV file. It fetches the content, extracts specific product information, and logs the operations performed. Optionally, the extracted content can also be saved. The script utilizes various libraries such as `requests`, `BeautifulSoup`, and `argparse` to ensure efficient and robust operation. - -## Prerequisites -Make sure the following Python packages are installed: -- `requests` -- `beautifulsoup4` -- `urllib3` - -To install the dependencies, run the following command: -```sh -pip install requests beautifulsoup4 -``` -## How to Use -Arguments -The script accepts command-line arguments that allow customization of behavior: ---csv_path: The path to the CSV file containing URLs to scrape. The default value is ./urls.csv. ---save_result: A boolean flag indicating whether to save the scraped results. The default value is False. -## Running the Script -You can run the script by using the following command: - -```sh -Copy code -python main.py --csv_path --save_result -``` -For example: -```sh -Copy code -python main.py --csv_path ./urls.csv --save_result True -``` -## CSV File Format -The CSV file should contain a list of URLs, with each URL on a new line. Here is an example: -``` -https://example.com/page1 -https://example.com/page2 -``` - diff --git a/crawl/urls.csv b/crawl/urls.csv deleted file mode 100644 index 46e1afd1..00000000 --- a/crawl/urls.csv +++ /dev/null @@ -1 +0,0 @@ -https://developer.hashicorp.com/terraform/docs \ No newline at end of file
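
Note on the new crawler (a sketch, not part of the patch): the rewritten `crawl/main.py` drops the retry/back-off session that the removed `content_parser.py` configured. If transient fetch failures against the AWS documentation pages become a problem, the same strategy could be reattached to the new `fetch_and_save` flow roughly as below; the helper name `build_session` is hypothetical, and the values (`total=5`, `backoff_factor=8`, `timeout=30`) are simply carried over from the deleted module rather than being a definitive implementation.

```python
# Sketch only (assumes requests, urllib3, and beautifulsoup4 are installed);
# build_session() is a hypothetical helper, not part of this patch.
import os

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def build_session() -> requests.Session:
    """Return a Session that retries failed requests, mirroring the removed parser."""
    retry_strategy = Retry(total=5, backoff_factor=8)
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session = requests.Session()
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session


def fetch_and_save(session: requests.Session, url: str, save_dir: str = "crawled_data") -> None:
    """Fetch one documentation page and save its title and paragraphs to a text file."""
    os.makedirs(save_dir, exist_ok=True)
    response = session.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.title.string.strip() if soup.title and soup.title.string else "no_title"
    paragraphs = soup.find_all("p")

    file_name = os.path.join(save_dir, f"{title}.txt")
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(f"Title: {title}\n\n")
        for para in paragraphs:
            file.write(para.get_text() + "\n")


if __name__ == "__main__":
    session = build_session()
    for url in [
        "https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/concepts.html",
        "https://docs.aws.amazon.com/ec2/latest/instancetypes/instance-types.html#current-gen-instances",
    ]:
        try:
            fetch_and_save(session, url)
        except requests.RequestException as exc:
            print(f"Failed to fetch {url}: {exc}")
```

As in the committed version, the page title is used verbatim as the output file name, which is how the two `crawled_data/*.txt` files added above got their names.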