diff --git a/Input Data.txt b/Input Data.txt
new file mode 100644
index 0000000..f91c067
--- /dev/null
+++ b/Input Data.txt
@@ -0,0 +1,175 @@
+marked
+https://fitgirl-repacks.site/all-my-repacks-a-z
+https://fitgirl-repacks.site/
+ul
+lcp_catlist
+lcp_nextlink
+smart_push_smio_not_allow
+marked
+https://gog-games.com/search/all/1/title/asc/any
+https://gog-games.com
+div
+game-blocks grid-view
+btn
+null
+marked
+https://masquerade.site/a-z/
+https://masquerade.site
+div
+letter-section
+null
+null
+marked
+https://nsw2u.xyz/switch-posts
+https://nsw2u.xyz/
+div
+letter-section
+null
+null
+marked
+https://madloader.com/switch-nsp-games-collection
+https://madloader.com/
+div
+entry-inner
+null
+null
+marked
+https://nxbrew.com/list-of-games/
+https://nxbrew.com/
+div
+letter-section
+null
+null
+marked
+https://www.xcinsp.com/
+https://www.xcinsp.com/
+ul
+dhswp-html-sitemap-post-list dhswp-post-list
+null
+null
+marked
+https://archive.org/download/Sony-Playstation-USA-Redump.org-2019-05-27/
+https://archive.org/download/Sony-Playstation-USA-Redump.org-2019-05-27/
+table
+directory-listing-table
+null
+null
+marked
+https://hexrom.com/roms/playstation/
+https://hexrom.com/
+div
+col-lg-4 col-sm-6 col-xs-12
+next page-numbers
+null
+marked
+https://hexrom.com/roms/playstation-2/
+https://hexrom.com/
+div
+col-lg-4 col-sm-6 col-xs-12
+next page-numbers
+null
+marked
+https://dlpsgame.org/list-all-game-ps2/
+https://dlpsgame.org/
+div
+listing-item
+null
+null
+marked
+https://gamesmountain.com/playstation_3
+https://gamesmountain.com/
+h2
+entry-title
+next page-numbers
+null
+marked
+https://dlpsgame.org/list-all-game-ps3/
+https://dlpsgame.org/
+li
+listing-item
+null
+null
+marked
+https://dlpsgame.org/list-all-game-ps4/
+https://dlpsgame.org/
+li
+listing-item
+null
+null
+marked
+https://dlxbgame.net/list-all-game-xbox-iso/
+https://dlxbgame.net/
+div
+listing-item
+null
+null
+marked
+https://hexrom.com/roms/microsoft-xbox/
+https://hexrom.com/
+div
+col-lg-4 col-sm-6 col-xs-12
+next page-numbers
+null
+marked
+https://gamesmountain.com/xbox_360_game
+https://gamesmountain.com/
+h2
+entry-title
+next page-numbers
+null
+marked
+https://hexrom.com/roms/xbox-360/
+https://hexrom.com/
+div
+col-lg-4 col-sm-6 col-xs-12
+next page-numbers
+null
+marked
+https://nswgame.com/list-all-game-wii/
+https://nswgame.com/
+li
+listing-item
+null
+null
+marked
+https://www.emulatorgames.net/roms/nintendo-wii/
+https://www.emulatorgames.net/
+ul
+site-list
+page-item
+null
+marked
+https://romskingdom.com/en/download-roms/nintendo-wii
+https://romskingdom.com/
+div
+row row-md
+next
+null
+marked
+https://archive.org/download/mame-merged/mame-merged/
+https://archive.org/download/mame-merged/mame-merged/
+table
+directory-listing-table
+null
+null
+marked
+https://ia801800.us.archive.org/view_archive.php?archive=/14/items/2020_01_06_fbn/roms/arcade.zip
+
+table
+archext
+null
+null
+marked
+https://archive.org/download/No-Intro-Collection_2016-01-03_Fixed/
+https://archive.org/download/No-Intro-Collection_2016-01-03_Fixed/
+table
+directory-listing-table
+null
+null
+marked
+https://archive.org/download/tosec-2021-02-14-to-2021-08-08-update/TOSEC%20%282021-02-14%20to%202021-08-08%29/
+https://archive.org/download/tosec-2021-02-14-to-2021-08-08-update/TOSEC%20%282021-02-14%20to%202021-08-08%29/
+table
+directory-listing-table
+null
+null
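Each record in Input Data.txt is seven lines, in the order grabber.py reads them: a collection key ("marked"), the start URL, the domain to prefix onto relative links, the container tag, that tag's class, the class of the next-page link, and the id of a bypass button to click first ("null" wherever a field does not apply). A minimal standalone parser for this format might look like the sketch below; ScrapeTarget and read_targets are illustrative names, not part of this repo.

from dataclasses import dataclass

@dataclass
class ScrapeTarget:
    key: str         # collection name, e.g. "marked"
    url: str         # page to start scraping from
    domain: str      # prefix for relative hrefs
    container: str   # tag wrapping the link lists
    css_class: str   # class of that tag
    next_link: str   # class of the next-page anchor, or "null"
    bypass: str      # id of a button to click first, or "null"

def read_targets(path="Input Data.txt"):
    with open(path, encoding="utf-8") as f:
        lines = [line.rstrip("\n") for line in f]
    # records are fixed-width: seven lines each
    return [ScrapeTarget(*lines[i:i + 7]) for i in range(0, len(lines) - 6, 7)]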
+ json_file = open("output.json") + strings = json.load(json_file) +except: + print('Json file to be cleaned not found.') +#Single version, used best on a single key or target +#target_key = "switch-1-105518" +#target_string = "https://nsw2u.xyz/aaa-clock-switch-nsp" +#while True:#user input version +# target_key = s if (s:=input('Enter Key (leave blank to repeat previous):')) else target_string +# if target_key == 'q': +# print('Stopping and Saving'): +# break +# target_string = s if (s:=input('Enter String (leave blank to repeat previous):')) else target_string +# strings[target_key] = [s for s in strings[target_key] if s != target_string] + +#Automated version, used on multiple keys and targets +try: + targets = {'marked':['https://masquerade.site#a-z-listing-1'],'marked':['https://nsw2u.xyz/#a-z-listing-2'],'marked':['https://madloader.com/request/'],'marked':['https://nxbrew.com/#a-z-listing-1'],'marked':['https://archive.org/download/mame-merged/mame-merged/../']}#add pairs here, or structure differently up to you format: 'key2':['target1','target2'] + for target_key,target_strings in targets.items(): + strings[target_key] = [s for s in strings[target_key] if s not in target_strings] + with open("outputcleaned.json", "w", encoding="utf-8") as file:#dumps to new json file to be used in totable.py + file.write( + json.dumps(strings) + ) +except: + print('Key(s) marked for cleaning nonexistent, completed.') +import forsearch diff --git a/extension_1_38_0_0.crx b/extension_1_38_0_0.crx new file mode 100644 index 0000000..6d74a68 Binary files /dev/null and b/extension_1_38_0_0.crx differ diff --git a/forsearch.py b/forsearch.py new file mode 100644 index 0000000..c7229a7 --- /dev/null +++ b/forsearch.py @@ -0,0 +1,57 @@ +import json +import random +import string +def listToString(s): + # initialize an empty string + str1 = "" + count=0 + # traverse in the string + for ele in s: + str1 += ele + count+=1 + if count==len(s): + continue + else: + str1+=" " + # return string + return str1 +input_file = 'outputcleaned.json' # input file +output_file = 'outputsearchready.json' +# Opening JSON file +f = open(input_file) +N=10 # ID length +count= 0 +dic= {} # to store overal output +data = json.load(f) +for k in data.keys(): + key = k + lst= [] + count+=1 + for sub_k in data[k]: # access each entry + ID= ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(N)) # generating IDs of N length + j = sub_k.split("/")[-1] + if j=='': + j= sub_k.split("/")[-2] + name=None + if "-" in j: + name = j.split("-") + else: + name= j.split("_") + name = [nam.title() for nam in name ] + name = listToString(name) + # print(sub_k.replace("https://",""), j, name) + lst.append({"id":ID, "basename":name.replace("%",""),"link":sub_k.replace("https://","")}) + dic[key] = lst + # comment next two lines, if you want output for all objects. 
diff --git a/extension_1_38_0_0.crx b/extension_1_38_0_0.crx
new file mode 100644
index 0000000..6d74a68
Binary files /dev/null and b/extension_1_38_0_0.crx differ
diff --git a/forsearch.py b/forsearch.py
new file mode 100644
index 0000000..c7229a7
--- /dev/null
+++ b/forsearch.py
@@ -0,0 +1,45 @@
+import json
+import random
+import string
+def listToString(s):
+    #join the list of words back into one space-separated string
+    return " ".join(s)
+input_file = 'outputcleaned.json' #input file
+output_file = 'outputsearchready.json'
+#opening the JSON file
+f = open(input_file)
+N = 10 #ID length
+count = 0
+dic = {} #to store overall output
+data = json.load(f)
+for k in data.keys():
+    key = k
+    lst = []
+    count += 1
+    for sub_k in data[k]: #access each entry
+        ID = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(N)) #generate an ID of N characters
+        j = sub_k.split("/")[-1]
+        if j == '':
+            j = sub_k.split("/")[-2]
+        #derive a readable name from the URL slug
+        if "-" in j:
+            name = j.split("-")
+        else:
+            name = j.split("_")
+        name = [nam.title() for nam in name]
+        name = listToString(name)
+        #print(sub_k.replace("https://",""), j, name)
+        lst.append({"id": ID, "basename": name.replace("%", ""), "link": sub_k.replace("https://", "")})
+    dic[key] = lst
+    #uncomment the next two lines to stop after three objects when spot-checking output
+    #if count == 3:
+    #    break
+with open(output_file, "w") as outfile:
+    json.dump(dic, outfile)
+#flatten the {"marked": [...]} wrapper so sendtosearch.py gets a plain list of documents
+json_file = open(output_file)
+wrapped = json.load(json_file)
+data = wrapped.pop('marked')
+with open(output_file, "w", encoding="utf-8") as outfile:
+    json.dump(data, outfile)
+import sendtosearch
\ No newline at end of file
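To see what forsearch.py emits, trace one URL through it: the slug after the last "/" is split on "-" (or on "_" when no hyphen is present), each word is title-cased, and the pieces are joined with spaces. The URL below is taken from the comments in cleaner.py; the ID shown is made up, since real IDs are random.

url = "https://nsw2u.xyz/aaa-clock-switch-nsp"
slug = url.split("/")[-1]                          # "aaa-clock-switch-nsp"
name = " ".join(w.title() for w in slug.split("-"))
print(name)                                        # "Aaa Clock Switch Nsp"
# emitted document, with a random 10-character id:
# {"id": "7K2PQ9XV3M", "basename": "Aaa Clock Switch Nsp", "link": "nsw2u.xyz/aaa-clock-switch-nsp"}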
diff --git a/grabber.py b/grabber.py
new file mode 100644
index 0000000..df5732c
--- /dev/null
+++ b/grabber.py
@@ -0,0 +1,105 @@
+#Libraries and importing
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import TimeoutException
+import random
+import json
+import time
+from decouple import config
+
+SELENIUMCLIENT = config('SELENIUMCLIENT')
+print('starting process')
+#setting up chrome options
+chrome_options = webdriver.ChromeOptions()
+#chrome_options.add_argument('--headless') #uncomment to run headless; the extension must also be removed for headless to work, not recommended
+chrome_options.add_extension('extension_1_38_0_0.crx')
+chrome_options.add_argument('--no-sandbox')
+chrome_options.add_argument('--disable-dev-shm-usage')
+chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36")
+#wd = webdriver.Chrome(executable_path='chromedriver', options=chrome_options) #if running locally
+wd = webdriver.Remote(SELENIUMCLIENT, options=chrome_options) #if running against a remote Selenium server
+json_data = {}
+
+#collect every link inside the matching containers on one page
+def link_container(site_name, container_tag, class_tag, html, domain):
+    soup = BeautifulSoup(html, 'html.parser')
+    containers = soup.find_all(container_tag, class_=class_tag)
+    for container in containers:
+        links = container.find_all("a")
+        for link in links:
+            if domain not in link['href']:
+                json_data[site_name].append(domain + link['href'])
+            else:
+                json_data[site_name].append(link['href'])
+
+#get the html of a webpage
+def request_page(url):
+    wd.get(url)
+    time.sleep(5)
+    return wd.page_source
+
+#find the next-page element, clicking a bypass button first if one is configured
+def return_next_ele(html, check_element, next_page):
+    if check_element != "null":
+        retries = 1
+        while retries <= 1:
+            try:
+                temp = WebDriverWait(wd, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[@id="{}"]'.format(check_element))))
+                temp.click()
+                break
+            except TimeoutException:
+                retries += 1
+    next_el = []
+    retries = 1
+    while retries <= 2:
+        try:
+            next_el = WebDriverWait(wd, 10).until(EC.element_to_be_clickable((By.XPATH, '//a[@class="{}"]'.format(next_page))))
+            next_el = wd.find_elements(By.XPATH, '//a[@class="{}"]'.format(next_page))[-1]
+            break
+        except TimeoutException:
+            retries += 1
+    return next_el
+
+#read scrape targets from the input file, seven lines per record
+input_file = open('Input Data.txt', 'r')
+name = input_file.readline().replace("\n", "")
+json_data[name] = []
+while True:
+    url = input_file.readline().replace("\n", "")
+    domain = input_file.readline().replace("\n", "")
+    container = input_file.readline().replace("\n", "")
+    clas = input_file.readline().replace("\n", "")
+    next_link = input_file.readline().replace("\n", "")
+    bypass = input_file.readline().replace("\n", "")
+    #load the page and collect links, following pagination until the URL stops changing
+    html = request_page(url)
+    next_el = return_next_ele(html, bypass, next_link)
+    old_url = ""
+    current_url = wd.current_url
+    while next_el is not None:
+        if old_url == current_url:
+            break
+        old_url = current_url
+        link_container(name, container, clas, wd.page_source, domain)
+        time.sleep(random.randint(1, 5))
+        if next_el == []:
+            break #no next page: links were collected once, stop
+        next_el.click()
+        html = wd.page_source
+        next_el = return_next_ele(html, bypass, next_link)
+        current_url = wd.current_url
+    name = input_file.readline().replace("\n", "")
+    if not name:
+        break
+    if name not in json_data:
+        json_data[name] = []
+input_file.close()
+#output the data to output.json
+output_file = open("output.json", "w")
+output_file.write(json.dumps(json_data))
+output_file.close()
+wd.close()
+import cleaner
\ No newline at end of file
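The heart of grabber.py is its pagination loop: scrape the current page, then follow the next-page element until either none is found or clicking it stops changing the URL. A stripped-down sketch of that control flow, with driver, find_next, and scrape_page as hypothetical stand-ins for the scraper's globals:

def paginate(driver, find_next, scrape_page):
    old_url = ""
    next_el = find_next()            # returns an element, or [] when absent
    while next_el is not None:
        if driver.current_url == old_url:
            break                    # the click had no effect; dead-end pager
        old_url = driver.current_url
        scrape_page(driver.page_source)
        if next_el == []:
            break                    # single page: scraped once, then stop
        next_el.click()
        next_el = find_next()

Note that a single-page site is still scraped exactly once before the [] check breaks out, which is why that check sits after the scrape rather than in the loop condition.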
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..4103deb
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+beautifulsoup4 == 4.10.0
+selenium == 4.1.0
+undetected-chromedriver == 3.0.6
+requests-html == 0.10.0
+output == 1.0.1
+json2table
+pysftp
+requests
+meilisearch
+python-decouple
diff --git a/sendtosearch.py b/sendtosearch.py
new file mode 100644
index 0000000..7be2094
--- /dev/null
+++ b/sendtosearch.py
@@ -0,0 +1,80 @@
+import meilisearch
+import json
+from json2table import convert
+from bs4 import BeautifulSoup as bs
+import os
+import pysftp as sftp #only needed for the commented-out sftp path below
+import requests
+import time
+import ftplib
+from decouple import config
+
+SEARCHCLIENT = config('SEARCHCLIENT')
+SEARCHAPIKEY = config('SEARCHAPIKEY')
+FTPSERVER = config('FTPSERVER')
+FTPUSER = config('FTPUSER')
+FTPPASS = config('FTPPASS')
+FTPLOCATION = config('FTPLOCATION')
+
+
+#client = meilisearch.Client('serverlocation', 'apikey')
+client = meilisearch.Client(SEARCHCLIENT, SEARCHAPIKEY)
+json_file = open('outputsearchready.json')
+games = json.load(json_file)
+#Delete the previous index first: Meilisearch adds documents on top of an existing index and in-place updates don't work well, so deleting and recreating is simpler.
+client.delete_index('games')
+client.index('games').add_documents(games)
+try:
+    json_file = open("outputcleaned.json") #json file to be used
+    strings = json.load(json_file) #loads the json
+except (FileNotFoundError, json.JSONDecodeError):
+    print('could not load json file.')
+build_direction = "LEFT_TO_RIGHT" #table build option
+table_attributes = {"style": "width:100%", "class": "alllinks"} #table attribute options
+html = convert(strings, build_direction=build_direction, table_attributes=table_attributes) #convert to an html table
+#print(html) #print if you want to watch it work
+htmlfile = open("table.html", "w") #creates or opens table.html
+htmlfile.write(html) #writes to table.html
+htmlfile.close() #saves table.html
+try:
+    with open('table.html', 'r') as f2:
+        tablefile = f2.read()
+except FileNotFoundError:
+    print('table file not found, rerun totable.py to make it.')
+    import totable
+#acquire the current html file
+try:
+    url = 'https://old.rezi.one'
+    r = requests.get(url)
+    time.sleep(20)
+    htmlfile = open("index.html", "w") #creates index.html
+    htmlfile.write(r.text) #writes to index.html
+    htmlfile.close() #saves index.html
+    htmlfile = open("index.html", "r")
+except (requests.RequestException, OSError):
+    print('site down or network not available; or python is having trouble writing to the new index.')
+#modify the html
+base = os.path.dirname(os.path.abspath(__file__))
+html = open(os.path.join(base, 'index.html'))
+soup = bs(html, 'html.parser')
+soup.table.append(tablefile)
+with open("index.html", "w") as outf:
+    outf.write(soup.prettify(formatter=None))
+#alternative: send the html through sftp (credentials redacted; fill in your own)
+#host = "192.168.0.200"
+#port = 21
+#username = "root"
+#password = "<redacted>"
+#cnopts = sftp.CnOpts()
+#cnopts.hostkeys = None
+#with sftp.Connection(host=host, username=username, password=password, cnopts=cnopts) as sftp:
+#    print("Connection successfully established ... ")
+#    sftp.cwd('/mnt/user/appdata/nginx/old.rezi.one/') # switch to a remote directory
+#    sftp.put('index.html')
+#sftp.close()
+filename = "index.html"
+ftp = ftplib.FTP(FTPSERVER)
+ftp.login(FTPUSER, FTPPASS)
+ftp.cwd(FTPLOCATION)
+uploadfile = open('./index.html', 'rb')
+ftp.storbinary('STOR ' + filename, uploadfile)
\ No newline at end of file
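After sendtosearch.py finishes, the index can be queried to confirm the documents landed. A minimal smoke test, assuming the same SEARCHCLIENT and SEARCHAPIKEY settings are available; the query string is only an example:

import meilisearch
from decouple import config

client = meilisearch.Client(config('SEARCHCLIENT'), config('SEARCHAPIKEY'))
results = client.index('games').search('mario')  # returns a dict with a 'hits' list
for hit in results['hits']:
    print(hit['basename'], '->', hit['link'])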