Commit

released v1.0.0
Wamy-Dev authored Mar 15, 2022
1 parent 60e4bd9 commit c9e9456
Showing 7 changed files with 467 additions and 0 deletions.
175 changes: 175 additions & 0 deletions Input Data.txt
@@ -0,0 +1,175 @@
marked
https://fitgirl-repacks.site/all-my-repacks-a-z
https://fitgirl-repacks.site/
ul
lcp_catlist
lcp_nextlink
smart_push_smio_not_allow
marked
https://gog-games.com/search/all/1/title/asc/any
https://gog-games.com
div
game-blocks grid-view
btn
null
marked
https://masquerade.site/a-z/
https://masquerade.site
div
letter-section
null
null
marked
https://nsw2u.xyz/switch-posts
https://nsw2u.xyz/
div
letter-section
null
null
marked
https://madloader.com/switch-nsp-games-collection
https://madloader.com/
div
entry-inner
null
null
marked
https://nxbrew.com/list-of-games/
https://nxbrew.com/
div
letter-section
null
null
marked
https://www.xcinsp.com/
https://www.xcinsp.com/
ul
dhswp-html-sitemap-post-list dhswp-post-list
null
null
marked
https://archive.org/download/Sony-Playstation-USA-Redump.org-2019-05-27/
https://archive.org/download/Sony-Playstation-USA-Redump.org-2019-05-27/
table
directory-listing-table
null
null
marked
https://hexrom.com/roms/playstation/
https://hexrom.com/
div
col-lg-4 col-sm-6 col-xs-12
next page-numbers
null
marked
https://hexrom.com/roms/playstation-2/
https://hexrom.com/
div
col-lg-4 col-sm-6 col-xs-12
next page-numbers
null
marked
https://dlpsgame.org/list-all-game-ps2/
https://dlpsgame.org/
div
listing-item
null
null
marked
https://gamesmountain.com/playstation_3
https://gamesmountain.com/
h2
entry-title
next page-numbers
null
marked
https://dlpsgame.org/list-all-game-ps3/
https://dlpsgame.org/
li
listing-item
null
null
marked
https://dlpsgame.org/list-all-game-ps4/
https://dlpsgame.org/
li
listing-item
null
null
marked
https://dlxbgame.net/list-all-game-xbox-iso/
https://dlxbgame.net/
div
listing-item
null
null
marked
https://hexrom.com/roms/microsoft-xbox/
https://hexrom.com/
div
col-lg-4 col-sm-6 col-xs-12
next page-numbers
null
marked
https://gamesmountain.com/xbox_360_game
https://gamesmountain.com/
h2
entry-title
next page-numbers
null
marked
https://hexrom.com/roms/xbox-360/
https://hexrom.com/
div
col-lg-4 col-sm-6 col-xs-12
next page-numbers
null
marked
https://nswgame.com/list-all-game-wii/
https://nswgame.com/
li
listing-item
null
null
marked
https://www.emulatorgames.net/roms/nintendo-wii/
https://www.emulatorgames.net/
ul
site-list
page-item
null
marked
https://romskingdom.com/en/download-roms/nintendo-wii
https://romskingdom.com/
div
row row-md
next
null
marked
https://archive.org/download/mame-merged/mame-merged/
https://archive.org/download/mame-merged/mame-merged/
table
directory-listing-table
null
null
marked
https://ia801800.us.archive.org/view_archive.php?archive=/14/items/2020_01_06_fbn/roms/arcade.zip

table
archext
null
null
marked
https://archive.org/download/No-Intro-Collection_2016-01-03_Fixed/
https://archive.org/download/No-Intro-Collection_2016-01-03_Fixed/
table
directory-listing-table
null
null
marked
https://archive.org/download/tosec-2021-02-14-to-2021-08-08-update/TOSEC%20%282021-02-14%20to%202021-08-08%29/
https://archive.org/download/tosec-2021-02-14-to-2021-08-08-update/TOSEC%20%282021-02-14%20to%202021-08-08%29/
table
directory-listing-table
null
null
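For reference, each record in Input Data.txt is seven lines: an output key ("marked"), then the page URL, the site domain, the container tag, the container's CSS class, the next-page link class, and a bypass-button id ("null" where unused). A minimal sketch of a standalone parser for this layout — the field names are assumptions inferred from grabber.py's read order, not defined by the repo:

# Sketch: parse Input Data.txt into (key, fields) records.
# Field names are assumptions inferred from grabber.py's read order.
def read_records(path="Input Data.txt"):
    fields = ("url", "domain", "container_tag", "class_tag", "next_link", "bypass")
    with open(path) as f:
        lines = [ln.rstrip("\n") for ln in f]
    for i in range(0, len(lines) - 6, 7):
        yield lines[i], dict(zip(fields, lines[i + 1:i + 7]))

for key, rec in read_records():
    print(key, rec["url"])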
30 changes: 30 additions & 0 deletions cleaner.py
@@ -0,0 +1,30 @@
import json

#loads the json file produced by grabber.py
try:
    with open("output.json") as json_file:
        strings = json.load(json_file)
except FileNotFoundError:
    print('Json file to be cleaned not found.')

#Single version, used best on a single key or target
#target_key = "switch-1-105518"
#target_string = "https://nsw2u.xyz/aaa-clock-switch-nsp"
#while True:  # user-input version
#    target_key = s if (s := input('Enter Key (leave blank to repeat previous):')) else target_key
#    if target_key == 'q':
#        print('Stopping and Saving')
#        break
#    target_string = s if (s := input('Enter String (leave blank to repeat previous):')) else target_string
#    strings[target_key] = [s for s in strings[target_key] if s != target_string]

#Automated version, used on multiple keys and targets.
#A dict cannot hold the same key twice, so all targets for one key go in a
#single list. Add pairs here; format: 'key2': ['target1', 'target2']
try:
    targets = {'marked': ['https://masquerade.site#a-z-listing-1',
                          'https://nsw2u.xyz/#a-z-listing-2',
                          'https://madloader.com/request/',
                          'https://nxbrew.com/#a-z-listing-1',
                          'https://archive.org/download/mame-merged/mame-merged/../']}
    for target_key, target_strings in targets.items():
        strings[target_key] = [s for s in strings[target_key] if s not in target_strings]
    with open("outputcleaned.json", "w", encoding="utf-8") as file:  # dumps to new json file to be used in totable.py
        file.write(json.dumps(strings))
except KeyError:
    print('Key(s) marked for cleaning nonexistent, completed.')

import forsearch  # chains into the next pipeline step
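To see the cleaning step in isolation, a quick sketch with illustrative data (the example URLs here are made up, not from the repo):

# Sketch of the cleaner's core loop on illustrative data.
strings = {"marked": ["https://a.example/keep", "https://a.example/drop"]}
targets = {"marked": ["https://a.example/drop"]}
for key, bad in targets.items():
    strings[key] = [s for s in strings[key] if s not in bad]
print(strings)  # {'marked': ['https://a.example/keep']}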
Binary file added extension_1_38_0_0.crx
57 changes: 57 additions & 0 deletions forsearch.py
@@ -0,0 +1,57 @@
import json
import random
import string

input_file = 'outputcleaned.json'   # input file
output_file = 'outputsearchready.json'

N = 10    # ID length
count = 0
dic = {}  # stores the overall output

# Opening JSON file
with open(input_file) as f:
    data = json.load(f)

for k in data.keys():
    lst = []
    count += 1
    for sub_k in data[k]:  # access each entry
        # generate an ID of N random uppercase letters/digits
        ID = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(N))
        # take the last non-empty path segment of the URL as the slug
        j = sub_k.split("/")[-1]
        if j == '':
            j = sub_k.split("/")[-2]
        # split the slug on hyphens (or underscores) and title-case each word
        name_parts = j.split("-") if "-" in j else j.split("_")
        name = " ".join(nam.title() for nam in name_parts)
        # print(sub_k.replace("https://", ""), j, name)
        lst.append({"id": ID, "basename": name.replace("%", ""), "link": sub_k.replace("https://", "")})
    dic[k] = lst
    # uncomment the next two lines to stop after three objects (debug check);
    # leave commented for output on all objects
    #if count == 3:
    #    break

with open(output_file, "w") as outfile:
    json.dump(dic, outfile)

# Re-open the result, unwrap the 'marked' key, and overwrite the file so the
# final output is the bare list of entries
with open(output_file) as json_file:
    loaded = json.load(json_file)
data = loaded.pop('marked')
with open(output_file, "w", encoding="utf-8") as out:  # dumps to new json file to be used in totable.py
    out.write(json.dumps(data))

import sendtosearch  # chains into the next pipeline step
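For one URL, the transformation forsearch.py applies looks like this (a sketch using the example link from cleaner.py's comments; the ID is random at runtime):

import json, random, string
link = "https://nsw2u.xyz/aaa-clock-switch-nsp"
slug = link.split("/")[-1] or link.split("/")[-2]   # last non-empty path segment
words = slug.split("-") if "-" in slug else slug.split("_")
name = " ".join(w.title() for w in words)           # "Aaa Clock Switch Nsp"
entry = {
    "id": "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10)),
    "basename": name.replace("%", ""),
    "link": link.replace("https://", ""),
}
print(json.dumps(entry))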
112 changes: 112 additions & 0 deletions grabber.py
@@ -0,0 +1,112 @@
#Libraries and importing
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import random
import json
import time
from decouple import config

SELENIUMCLIENT = config('SELENIUMCLIENT')
#starting timer
print('starting process')
#setting up chrome settings
uc = webdriver  # alias left over from undetected-chromedriver, which shares this API
chrome_options = webdriver.ChromeOptions()
#chrome_options.add_argument('--headless')  # uncomment to run in headless mode; the extension must also be removed for this to work, not recommended
chrome_options.add_extension('extension_1_38_0_0.crx')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36")
#wd = uc.Chrome(executable_path='chromedriver', options=chrome_options)  # if local
wd = uc.Remote(SELENIUMCLIENT, options=chrome_options)  # if remote

json_data = {}

#collect the links on a page and append them to the output dict
def link_container(site_name, container_tag, class_tag, html, domain):
    soup = BeautifulSoup(html, 'html.parser')
    containers = soup.find_all(container_tag, class_=class_tag)
    for container in containers:
        for link in container.find_all("a"):
            if domain not in link['href']:
                json_data[site_name].append(domain + link['href'])  # relative link: prepend the domain
            else:
                json_data[site_name].append(link['href'])

#get the html of a webpage
def request_page(url):
    wd.get(url)
    time.sleep(5)
    return wd.page_source

#get the next-page element, clicking a bypass button first if one is given
def return_next_ele(check_element, next_page):
    if check_element != "null":
        retries = 1
        while retries <= 1:
            try:
                temp = WebDriverWait(wd, 10).until(
                    EC.element_to_be_clickable((By.XPATH, '//button[@id="{}"]'.format(check_element))))
                temp.click()
                break
            except TimeoutException:
                retries += 1
    next_ele = []
    retries = 1
    while retries <= 2:
        try:
            WebDriverWait(wd, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//a[@class="{}"]'.format(next_page))))
            next_ele = wd.find_elements(By.XPATH, '//a[@class="{}"]'.format(next_page))[-1]
            break
        except TimeoutException:
            retries += 1
    return next_ele

#getting data from input file: one key line, then six field lines per record
input_file = open('Input Data.txt', 'r')
name = input_file.readline().replace("\n", "")
json_data[name] = []
while True:
    url = input_file.readline().replace("\n", "")
    domain = input_file.readline().replace("\n", "")
    container = input_file.readline().replace("\n", "")
    clas = input_file.readline().replace("\n", "")
    next_link = input_file.readline().replace("\n", "")
    bypass = input_file.readline().replace("\n", "")
    #getting page and getting links for output file
    html = request_page(url)
    next_ele = return_next_ele(bypass, next_link)
    old_url = ""
    current_url = wd.current_url
    while next_ele is not None:  # the original 'next is not []' was always True; the loop exits via the breaks below
        if old_url == current_url:
            break
        old_url = current_url
        link_container(name, container, clas, wd.page_source, domain)
        time.sleep(random.randint(1, 5))
        if next_ele == []:  # no next-page link found: single page, stop after collecting it
            break
        next_ele.click()
        next_ele = return_next_ele(bypass, next_link)
        current_url = wd.current_url
    name = input_file.readline().replace("\n", "")
    if not name:
        break  # checked before registering the key, so EOF doesn't add an empty '' entry
    if name not in json_data:
        json_data[name] = []
input_file.close()

#outputting the data to output.json
with open("output.json", "w") as output_file:
    output_file.write(json.dumps(json_data))
wd.quit()  # quit() ends the (remote) session; close() would only close the window

import cleaner  # chains into the next pipeline step
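grabber.py reads SELENIUMCLIENT through python-decouple's config(), so it expects a .env file (or an environment variable) alongside the scripts. A minimal sketch — the address below is an assumption for a local Selenium Grid, not something the repo specifies:

# .env (read by python-decouple) — the URL is a placeholder assumption
SELENIUMCLIENT=http://localhost:4444/wd/hub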
11 changes: 11 additions & 0 deletions requirements.txt
@@ -0,0 +1,11 @@
beautifulsoup4 == 4.10.0
selenium == 4.1.0
undetected-chromedriver == 3.0.6
requests-html == 0.10.0
output == 1.0.1
json2table
pysftp
requests
meilisearch
python-decouple
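Because each script imports the next (grabber.py → cleaner.py → forsearch.py → sendtosearch.py), the whole pipeline runs from a single entry point: install the dependencies with pip install -r requirements.txt, then run python grabber.py.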