Skip to content
This repository has been archived by the owner on Jan 25, 2022. It is now read-only.

Commit

Permalink
Merge branch 'release/v0.1.5'
Browse files Browse the repository at this point in the history
  • Loading branch information
taizan-hokuto committed Sep 3, 2020
2 parents f9480ea + 0456300 commit a790ab1
Show file tree
Hide file tree
Showing 27 changed files with 429 additions and 393 deletions.
2 changes: 1 addition & 1 deletion pytchat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
pytchat is a lightweight python library to browse youtube livechat without Selenium or BeautifulSoup.
"""
__copyright__ = 'Copyright (C) 2019 taizan-hokuto'
__version__ = '0.1.4'
__version__ = '0.1.5'
__license__ = 'MIT'
__author__ = 'taizan-hokuto'
__author_email__ = '55448286+taizan-hokuto@users.noreply.github.com'
Expand Down
98 changes: 62 additions & 36 deletions pytchat/cli/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
import argparse

import os
import signal
from json.decoder import JSONDecodeError
from pathlib import Path
from pytchat.util.extract_video_id import extract_video_id
from .arguments import Arguments
from .. exceptions import InvalidVideoIdException, NoContents, VideoInfoParseException
from .progressbar import ProgressBar
from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchError
from .. processors.html_archiver import HTMLArchiver
from .. tool.extract.extractor import Extractor
from .. tool.videoinfo import VideoInfo
from .. util.extract_video_id import extract_video_id
from .. import util
from .. import __version__

'''
Expand All @@ -29,46 +34,67 @@ def main():
help='Output directory (end with "/"). default="./"', default='./')
parser.add_argument(f'--{Arguments.Name.VERSION}', action='store_true',
help='Show version')
parser.add_argument(f'--{Arguments.Name.SAVE_ERROR_DATA}', action='store_true',
help='Save error data when error occurs(".dat" file)')
Arguments(parser.parse_args().__dict__)
if Arguments().print_version:
print(f'pytchat v{__version__} © 2019 taizan-hokuto')
return

# Extractor
if Arguments().video_ids:
for video_id in Arguments().video_ids:
if '[' in video_id:
video_id = video_id.replace('[', '').replace(']', '')
try:
video_id = extract_video_id(video_id)
if os.path.exists(Arguments().output):
path = Path(Arguments().output + video_id + '.html')
else:
raise FileNotFoundError
info = VideoInfo(video_id)
print(f"Extracting...\n"
f" video_id: {video_id}\n"
f" channel: {info.get_channel_name()}\n"
f" title: {info.get_title()}")

print(f" output path: {path.resolve()}")
Extractor(video_id,
processor=HTMLArchiver(
Arguments().output + video_id + '.html'),
callback=_disp_progress
).extract()
print("\nExtraction end.\n")
except InvalidVideoIdException:
print("Invalid Video ID or URL:", video_id)
except (TypeError, NoContents) as e:
print(e)
except FileNotFoundError:
print("The specified directory does not exist.:{}".format(Arguments().output))
except VideoInfoParseException:
print("Cannot parse video information.:{}".format(video_id))
if not Arguments().video_ids:
parser.print_help()
return
parser.print_help()
for video_id in Arguments().video_ids:
if '[' in video_id:
video_id = video_id.replace('[', '').replace(']', '')
try:
video_id = extract_video_id(video_id)
if os.path.exists(Arguments().output):
path = Path(Arguments().output + video_id + '.html')
else:
raise FileNotFoundError
info = VideoInfo(video_id)
print(f"Extracting...\n"
f" video_id: {video_id}\n"
f" channel: {info.get_channel_name()}\n"
f" title: {info.get_title()}")

print(f" output path: {path.resolve()}")
duration = info.get_duration()
pbar = ProgressBar(duration)
ex = Extractor(video_id,
processor=HTMLArchiver(Arguments().output + video_id + '.html'),
callback=pbar._disp,
div=10)
signal.signal(signal.SIGINT, (lambda a, b: cancel(ex, pbar)))
ex.extract()
pbar.close()
if pbar.is_cancelled():
print("\nThe extraction process has been discontinued.\n")
return
print("\nThe extraction process has been completed.\n")
except InvalidVideoIdException:
print("Invalid Video ID or URL:", video_id)
except (TypeError, NoContents) as e:

print(e.with_traceback())
except FileNotFoundError:
print("The specified directory does not exist.:{}".format(Arguments().output))
except JSONDecodeError as e:
print(e.msg)
print("Cannot parse video information.:{}".format(video_id))
if Arguments().save_error_data:
util.save(e.doc, "ERR_JSON_DECODE", ".dat")
except PatternUnmatchError as e:
print(e.msg)
print("Cannot parse video information.:{}".format(video_id))
if Arguments().save_error_data:
util.save(e.doc, "ERR_PATTERN_UNMATCH", ".dat")

return


def _disp_progress(a, b):
print('.', end="", flush=True)
def cancel(ex: Extractor, pbar: ProgressBar):
    '''
    Cancel a running extraction together with its progress bar.

    main() installs this as the SIGINT handler body, so it runs when the
    user interrupts an extraction.

    Parameter
    ---------
    ex : Extractor
        Extractor whose cancel() is invoked.
        NOTE(review): presumably this stops the fetch loop - confirm in Extractor.
    pbar : ProgressBar
        Progress bar to mark as cancelled so that it stops redrawing.
    '''
    ex.cancel()
    pbar.cancel()
5 changes: 2 additions & 3 deletions pytchat/cli/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class Name:
VERSION: str = 'version'
OUTPUT: str = 'output_dir'
VIDEO_IDS: str = 'video_id'
SAVE_ERROR_DATA: bool = 'save_error_data'

def __init__(self,
arguments: Optional[Dict[str, Union[str, bool, int]]] = None):
Expand All @@ -34,10 +35,8 @@ def __init__(self,
self.print_version: bool = arguments[Arguments.Name.VERSION]
self.output: str = arguments[Arguments.Name.OUTPUT]
self.video_ids: List[int] = []
self.save_error_data: bool = arguments[Arguments.Name.SAVE_ERROR_DATA]
# Videos
if arguments[Arguments.Name.VIDEO_IDS]:
self.video_ids = [video_id
for video_id in arguments[Arguments.Name.VIDEO_IDS].split(',')]



41 changes: 41 additions & 0 deletions pytchat/cli/progressbar.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
'''
The code for this progress bar is based on
vladignatyev/progress.py
https://gist.github.com/vladignatyev/06860ec2040cb497f0f3
(MIT License)
'''
import sys


class ProgressBar:
    '''
    Single-line text progress bar written to stdout.

    Progress is accumulated across calls and drawn relative to the
    `duration` passed to the constructor. The line ends with a carriage
    return so that each redraw overwrites the previous one.
    '''

    def __init__(self, duration):
        '''
        Parameter
        ---------
        duration :
            Total amount that corresponds to a completely filled bar.
            When it is 0, nothing is ever drawn.
        '''
        self._duration = duration
        self._count = 0          # progress accumulated so far
        self._bar_len = 60       # width of the bar in characters
        self._cancelled = False

    def _disp(self, _, fetched):
        '''
        Callback-style entry point: add `fetched / 1000` to the progress.

        The first argument is ignored.
        NOTE(review): the division suggests `fetched` is in milliseconds
        while `duration` is in seconds - confirm against the caller.
        '''
        self._progress(fetched / 1000, self._duration)

    def _progress(self, fillin, total, status=''):
        '''
        Add `fillin` to the accumulated progress and redraw the bar.

        Parameter
        ---------
        fillin :
            Amount of progress to add.
        total :
            Amount that corresponds to 100%. Nothing is drawn when 0.
        status : str
            Optional text appended after the percentage.
        '''
        if total == 0 or self._cancelled:
            return
        self._count += fillin
        if self._count > total:
            # Clamp on the progress itself rather than on the rounded bar
            # length; otherwise a small overshoot (less than half a bar
            # cell) leaves the bar full while printing more than 100%.
            filled_len = self._bar_len
            percents = 100
        else:
            filled_len = int(round(self._bar_len * self._count / float(total)))
            percents = round(100.0 * self._count / float(total), 1)
        bar = '=' * filled_len + ' ' * (self._bar_len - filled_len)
        sys.stdout.write(' [%s] %s%s ...%s\r' % (bar, percents, '%', status))
        sys.stdout.flush()

    def close(self):
        '''
        Draw the bar as complete, unless it has been cancelled.
        '''
        if not self._cancelled:
            # Adding the whole duration guarantees the bar reaches 100%.
            self._progress(self._duration, self._duration)

    def cancel(self):
        '''
        Mark the bar as cancelled; later updates draw nothing.
        '''
        self._cancelled = True

    def is_cancelled(self):
        '''
        Return True when cancel() has been called.
        '''
        return self._cancelled
3 changes: 2 additions & 1 deletion pytchat/config/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import logging
from . import mylogger
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'}
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
}


def logger(module_name: str, loglevel=None):
Expand Down
37 changes: 18 additions & 19 deletions pytchat/core_async/livechat.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import aiohttp

import asyncio
import httpx
import json
import signal
import time
import traceback
import urllib.parse
from aiohttp.client_exceptions import ClientConnectorError
from concurrent.futures import CancelledError
from asyncio import Queue
from concurrent.futures import CancelledError
from .buffer import Buffer
from ..parser.live import Parser
from .. import config
Expand All @@ -22,7 +22,7 @@


class LiveChatAsync:
'''asyncio(aiohttp)を利用してYouTubeのライブ配信のチャットデータを取得する
'''asyncioを利用してYouTubeのライブ配信のチャットデータを取得する
Parameter
---------
Expand Down Expand Up @@ -161,11 +161,11 @@ async def _listen(self, continuation):
parameter for next chat data
'''
try:
async with aiohttp.ClientSession() as session:
async with httpx.AsyncClient(http2=True) as client:
while(continuation and self._is_alive):
continuation = await self._check_pause(continuation)
contents = await self._get_contents(
continuation, session, headers)
continuation, client, headers)
metadata, chatdata = self._parser.parse(contents)

timeout = metadata['timeoutMs'] / 1000
Expand Down Expand Up @@ -210,7 +210,7 @@ async def _check_pause(self, continuation):
self._video_id, 3, self._topchat_only)
return continuation

async def _get_contents(self, continuation, session, headers):
async def _get_contents(self, continuation, client, headers):
'''Get 'continuationContents' from livechat json.
If contents is None at first fetching,
try to fetch archive chat data.
Expand All @@ -219,7 +219,7 @@ async def _get_contents(self, continuation, session, headers):
-------
'continuationContents' which includes metadata & chatdata.
'''
livechat_json = await self._get_livechat_json(continuation, session, headers)
livechat_json = await self._get_livechat_json(continuation, client, headers)
contents = self._parser.get_contents(livechat_json)
if self._first_fetch:
if contents is None or self._is_replay:
Expand All @@ -229,18 +229,18 @@ async def _get_contents(self, continuation, session, headers):
continuation = arcparam.getparam(
self._video_id, self.seektime, self._topchat_only)
livechat_json = (await self._get_livechat_json(
continuation, session, headers))
continuation, client, headers))
reload_continuation = self._parser.reload_continuation(
self._parser.get_contents(livechat_json))
if reload_continuation:
livechat_json = (await self._get_livechat_json(
reload_continuation, session, headers))
reload_continuation, client, headers))
contents = self._parser.get_contents(livechat_json)
self._is_replay = True
self._first_fetch = False
return contents

async def _get_livechat_json(self, continuation, session, headers):
async def _get_livechat_json(self, continuation, client, headers):
'''
Get json which includes chat data.
'''
Expand All @@ -249,14 +249,13 @@ async def _get_livechat_json(self, continuation, session, headers):
status_code = 0
url = f"https://www.youtube.com/{self._fetch_url}{continuation}&pbj=1"
for _ in range(MAX_RETRY + 1):
async with session.get(url, headers=headers) as resp:
try:
text = await resp.text()
livechat_json = json.loads(text)
break
except (ClientConnectorError, json.JSONDecodeError):
await asyncio.sleep(1)
continue
try:
resp = await client.get(url, headers=headers)
livechat_json = resp.json()
break
except (httpx.HTTPError, json.JSONDecodeError):
await asyncio.sleep(1)
continue
else:
self._logger.error(f"[{self._video_id}]"
f"Exceeded retry count. status_code={status_code}")
Expand Down
21 changes: 10 additions & 11 deletions pytchat/core_multithread/livechat.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import requests
import httpx
import json
import signal
import time
Expand Down Expand Up @@ -153,10 +153,10 @@ def _listen(self, continuation):
parameter for next chat data
'''
try:
with requests.Session() as session:
with httpx.Client(http2=True) as client:
while(continuation and self._is_alive):
continuation = self._check_pause(continuation)
contents = self._get_contents(continuation, session, headers)
contents = self._get_contents(continuation, client, headers)
metadata, chatdata = self._parser.parse(contents)
timeout = metadata['timeoutMs'] / 1000
chat_component = {
Expand Down Expand Up @@ -199,7 +199,7 @@ def _check_pause(self, continuation):
continuation = liveparam.getparam(self._video_id, 3)
return continuation

def _get_contents(self, continuation, session, headers):
def _get_contents(self, continuation, client, headers):
'''Get 'continuationContents' from livechat json.
If contents is None at first fetching,
try to fetch archive chat data.
Expand All @@ -209,7 +209,7 @@ def _get_contents(self, continuation, session, headers):
'continuationContents' which includes metadata & chat data.
'''
livechat_json = (
self._get_livechat_json(continuation, session, headers)
self._get_livechat_json(continuation, client, headers)
)
contents = self._parser.get_contents(livechat_json)
if self._first_fetch:
Expand All @@ -219,18 +219,18 @@ def _get_contents(self, continuation, session, headers):
self._fetch_url = "live_chat_replay/get_live_chat_replay?continuation="
continuation = arcparam.getparam(
self._video_id, self.seektime, self._topchat_only)
livechat_json = (self._get_livechat_json(continuation, session, headers))
livechat_json = (self._get_livechat_json(continuation, client, headers))
reload_continuation = self._parser.reload_continuation(
self._parser.get_contents(livechat_json))
if reload_continuation:
livechat_json = (self._get_livechat_json(
reload_continuation, session, headers))
reload_continuation, client, headers))
contents = self._parser.get_contents(livechat_json)
self._is_replay = True
self._first_fetch = False
return contents

def _get_livechat_json(self, continuation, session, headers):
def _get_livechat_json(self, continuation, client, headers):
'''
Get json which includes chat data.
'''
Expand All @@ -239,10 +239,9 @@ def _get_livechat_json(self, continuation, session, headers):
status_code = 0
url = f"https://www.youtube.com/{self._fetch_url}{continuation}&pbj=1"
for _ in range(MAX_RETRY + 1):
with session.get(url, headers=headers) as resp:
with client:
try:
text = resp.text
livechat_json = json.loads(text)
livechat_json = client.get(url, headers=headers).json()
break
except json.JSONDecodeError:
time.sleep(1)
Expand Down
11 changes: 10 additions & 1 deletion pytchat/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,16 @@ class FailedExtractContinuation(ChatDataFinished):
pass


class VideoInfoParseException(Exception):
class VideoInfoParseError(Exception):
    '''
    Raised when video information cannot be parsed.
    '''


class PatternUnmatchError(VideoInfoParseError):
    '''
    Raised when video information cannot be parsed because an expected
    pattern was not matched.

    Parameter
    ---------
    doc :
        Data kept on the exception as `doc` so that a handler can
        inspect or save it.
    '''
    def __init__(self, doc):
        # Fixed label exposed as `msg`, alongside the offending data.
        self.msg = "PatternUnmatchError"
        self.doc = doc
Loading

0 comments on commit a790ab1

Please sign in to comment.