Skip to content

Commit

Permalink
fix(scrap-youtube): bypass consent screen (#133)
Browse files Browse the repository at this point in the history
  • Loading branch information
moshfeu authored Apr 4, 2024
1 parent 30b01da commit 38ee53b
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 28 deletions.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
"fix-path": "^2.1.0",
"fluent-ffmpeg": "2.1.2",
"js-video-url-parser": "^0.2.8",
"miniget": "^4.2.3",
"mkdirp": "^0.5.1",
"mobx": "^5.8.0",
"mobx-react": "^5.4.3",
Expand Down
73 changes: 45 additions & 28 deletions src/services/playlist-scraper.ts
Original file line number Diff line number Diff line change
@@ -1,48 +1,65 @@
import * as cheerio from 'cheerio';
import * as miniget from 'miniget';

/**
 * Downloads `url` with miniget and resolves to the response body as text.
 *
 * The request is sent with a desktop-Chrome `User-Agent` header.
 * NOTE(review): presumably the browser-like User-Agent is what keeps YouTube
 * from answering with its consent page — confirm before changing it.
 *
 * @param url Absolute URL of the page to download.
 * @returns The response body as a string; any miniget failure rejects the
 *          returned promise unchanged.
 */
const fetchWithMiniget = async (url: string) => {
  const requestOptions = {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    },
  };

  const pageRequest = miniget(url, requestOptions);
  return pageRequest.text();
};

const getPlaylistPageContent = async (playlistURL: string) => {
const playlistId = new URL(playlistURL).searchParams.get('list');
const response = await fetch(`https://www.youtube.com/playlist?list=${playlistId}`);
if (!response.ok) {
switch (response.status) {
try {
const playlistId = new URL(playlistURL).searchParams.get('list');
const response = await fetchWithMiniget(`https://www.youtube.com/playlist?list=${playlistId}`);
return response;
} catch (error /*: Miniget.MinigetError */) {
console.log('failed to fetch playlist page', error);
switch (error.statusCode) {
case 403:
throw new Error('Playlist is private or not accessible to the app');
case 400:
case 404:
throw new Error('Invalid playlist URL');
case 404:
throw new Error('Playlist not found');
default:
throw new Error('Failed to fetch playlist page');
}
}
return response.text();
}

export const scrap = async (playlistURL: string) => {
const data = await getPlaylistPageContent(playlistURL);

const $ = cheerio.load(data);
const ytInitialData = $('script').filter((_index, tag) => {
const html = cheerio.html(tag);
return html.includes('ytInitialData');
})[0];
try {
const $ = cheerio.load(data);
const ytInitialData = $('script').filter((_index, tag) => {
const html = cheerio.html(tag);
return html.includes('ytInitialData');
})[0];

const [jsonStr] = /{.*}/gm.exec(cheerio.html(ytInitialData));
const info = JSON.parse(jsonStr);
const {
contents,
} = info.contents.twoColumnBrowseResultsRenderer.tabs[0].tabRenderer.content.sectionListRenderer.contents[0].itemSectionRenderer.contents[0].playlistVideoListRenderer;
const songs = contents
.filter(({ playlistVideoRenderer }) => playlistVideoRenderer)
.map(({ playlistVideoRenderer }) => ({
id: playlistVideoRenderer.videoId,
name: playlistVideoRenderer.title.runs[0].text,
}));
const [jsonStr] = /{.*}/gm.exec(cheerio.html(ytInitialData));
const info = JSON.parse(jsonStr);
const {
contents,
} = info.contents.twoColumnBrowseResultsRenderer.tabs[0].tabRenderer.content.sectionListRenderer.contents[0].itemSectionRenderer.contents[0].playlistVideoListRenderer;
const songs = contents
.filter(({ playlistVideoRenderer }) => playlistVideoRenderer)
.map(({ playlistVideoRenderer }) => ({
id: playlistVideoRenderer.videoId,
name: playlistVideoRenderer.title.runs[0].text,
}));

const hasMore = contents.length > songs.length
const hasMore = contents.length > songs.length

return {
name: info.metadata.playlistMetadataRenderer.title,
playlist: songs,
hasMore,
};
return {
name: info.metadata.playlistMetadataRenderer.title,
playlist: songs,
hasMore,
};
} catch (err) {
console.error(err);
throw new Error('The playlist may be private, or the URL is invalid');
}
};
5 changes: 5 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -5398,6 +5398,11 @@ miniget@^4.2.2:
resolved "https://registry.yarnpkg.com/miniget/-/miniget-4.2.2.tgz#db20320f265efdc4c1826a0be431d56753074475"
integrity sha512-a7voNL1N5lDMxvTMExOkg+Fq89jM2vY8pAi9ZEWzZtfNmdfP6RXkvUtFnCAXoCv2T9k1v/fUJVaAEuepGcvLYA==

miniget@^4.2.3:
version "4.2.3"
resolved "https://registry.yarnpkg.com/miniget/-/miniget-4.2.3.tgz#3707a24c7c11c25d359473291638ab28aab349bd"
integrity sha512-SjbDPDICJ1zT+ZvQwK0hUcRY4wxlhhNpHL9nJOB2MEAXRGagTljsO8MEDzQMTFf0Q8g4QNi8P9lEm/g7e+qgzA==

minimatch@3.0.4:
version "3.0.4"
resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.0.4.tgz#5166e286457f03306064be5497e8dbb0c3d32083"
Expand Down

0 comments on commit 38ee53b

Please sign in to comment.