Merge branch 'release/v0.1.5'

taizan-hokuto · Sep 3, 2020 · a790ab1 · a790ab1
2 parents f9480ea + 0456300
commit a790ab1
Show file tree

Hide file tree

Showing 27 changed files with 429 additions and 393 deletions.
diff --git a/pytchat/__init__.py b/pytchat/__init__.py
@@ -2,7 +2,7 @@
 pytchat is a lightweight python library to browse youtube livechat without Selenium or BeautifulSoup.
 """
 __copyright__    = 'Copyright (C) 2019 taizan-hokuto'
-__version__      = '0.1.4'
+__version__      = '0.1.5'
 __license__      = 'MIT'
 __author__       = 'taizan-hokuto'
 __author_email__ = '55448286+taizan-hokuto@users.noreply.github.com'

diff --git a/pytchat/cli/__init__.py b/pytchat/cli/__init__.py
@@ -1,12 +1,17 @@
 import argparse
+
 import os
+import signal
+from json.decoder import JSONDecodeError
 from pathlib import Path
-from pytchat.util.extract_video_id import extract_video_id
 from .arguments import Arguments
-from .. exceptions import InvalidVideoIdException, NoContents, VideoInfoParseException
+from .progressbar import ProgressBar
+from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchError
 from .. processors.html_archiver import HTMLArchiver
 from .. tool.extract.extractor import Extractor
 from .. tool.videoinfo import VideoInfo
+from .. util.extract_video_id import extract_video_id
+from .. import util
 from .. import __version__
 
 '''
@@ -29,46 +34,67 @@ def main():
                         help='Output directory (end with "/"). default="./"', default='./')
     parser.add_argument(f'--{Arguments.Name.VERSION}', action='store_true',
                         help='Show version')
+    parser.add_argument(f'--{Arguments.Name.SAVE_ERROR_DATA}', action='store_true',
+                        help='Save error data when error occurs(".dat" file)')
     Arguments(parser.parse_args().__dict__)
     if Arguments().print_version:
         print(f'pytchat v{__version__}     © 2019 taizan-hokuto')
         return
 
     # Extractor
-    if Arguments().video_ids:
-        for video_id in Arguments().video_ids:
-            if '[' in video_id:
-                video_id = video_id.replace('[', '').replace(']', '')
-            try:
-                video_id = extract_video_id(video_id)
-                if os.path.exists(Arguments().output):
-                    path = Path(Arguments().output + video_id + '.html')
-                else:
-                    raise FileNotFoundError
-                info = VideoInfo(video_id)
-                print(f"Extracting...\n"
-                      f" video_id: {video_id}\n"
-                      f" channel:  {info.get_channel_name()}\n"
-                      f" title:    {info.get_title()}")
-
-                print(f" output path: {path.resolve()}")
-                Extractor(video_id,
-                          processor=HTMLArchiver(
-                              Arguments().output + video_id + '.html'),
-                          callback=_disp_progress
-                          ).extract()
-                print("\nExtraction end.\n")
-            except InvalidVideoIdException:
-                print("Invalid Video ID or URL:", video_id)
-            except (TypeError, NoContents) as e:
-                print(e)
-            except FileNotFoundError:
-                print("The specified directory does not exist.:{}".format(Arguments().output))
-            except VideoInfoParseException:
-                print("Cannot parse video information.:{}".format(video_id))
+    if not Arguments().video_ids:
+        parser.print_help()
         return
-    parser.print_help()
+    for video_id in Arguments().video_ids:
+        if '[' in video_id:
+            video_id = video_id.replace('[', '').replace(']', '')
+        try:
+            video_id = extract_video_id(video_id)
+            if os.path.exists(Arguments().output):
+                path = Path(Arguments().output + video_id + '.html')
+            else:
+                raise FileNotFoundError
+            info = VideoInfo(video_id)
+            print(f"Extracting...\n"
+                  f" video_id: {video_id}\n"
+                  f" channel:  {info.get_channel_name()}\n"
+                  f" title:    {info.get_title()}")
+
+            print(f" output path: {path.resolve()}")
+            duration = info.get_duration()
+            pbar = ProgressBar(duration)
+            ex = Extractor(video_id,
+                    processor=HTMLArchiver(Arguments().output + video_id + '.html'),
+                    callback=pbar._disp,
+                    div=10)
+            signal.signal(signal.SIGINT, (lambda a, b: cancel(ex, pbar)))
+            ex.extract()
+            pbar.close()
+            if pbar.is_cancelled():
+                print("\nThe extraction process has been discontinued.\n")
+                return
+            print("\nThe extraction process has been completed.\n")
+        except InvalidVideoIdException:
+            print("Invalid Video ID or URL:", video_id)
+        except (TypeError, NoContents) as e:
+
+            print(e.with_traceback())
+        except FileNotFoundError:
+            print("The specified directory does not exist.:{}".format(Arguments().output))
+        except JSONDecodeError as e:
+            print(e.msg)
+            print("Cannot parse video information.:{}".format(video_id))
+            if Arguments().save_error_data:
+                util.save(e.doc, "ERR_JSON_DECODE", ".dat")
+        except PatternUnmatchError as e:
+            print(e.msg)
+            print("Cannot parse video information.:{}".format(video_id))
+            if Arguments().save_error_data:
+                util.save(e.doc, "ERR_PATTERN_UNMATCH", ".dat")
+
+    return
 
 
-def _disp_progress(a, b):
-    print('.', end="", flush=True)
+def cancel(ex: Extractor, pbar: ProgressBar):
+    '''Cancel an extraction and flag its progress bar as cancelled.
+
+    main() installs this (wrapped in a lambda) as the SIGINT handler, so
+    it runs when the user interrupts the CLI.
+
+    Parameter
+    ---------
+    ex : Extractor
+        extractor whose cancel() is called first.
+    pbar : ProgressBar
+        progress bar whose cancel() is called afterwards; main() then
+        checks pbar.is_cancelled() to report a discontinued extraction.
+    '''
+    ex.cancel()
+    pbar.cancel()
diff --git a/pytchat/cli/arguments.py b/pytchat/cli/arguments.py
@@ -18,6 +18,7 @@ class Name:
         VERSION: str = 'version'
         OUTPUT: str = 'output_dir'
         VIDEO_IDS: str = 'video_id'
+        SAVE_ERROR_DATA: bool = 'save_error_data'
 
     def __init__(self,
                  arguments: Optional[Dict[str, Union[str, bool, int]]] = None):
@@ -34,10 +35,8 @@ def __init__(self,
         self.print_version: bool = arguments[Arguments.Name.VERSION]
         self.output: str = arguments[Arguments.Name.OUTPUT]
         self.video_ids: List[int] = []
+        self.save_error_data: bool = arguments[Arguments.Name.SAVE_ERROR_DATA]
         # Videos
         if arguments[Arguments.Name.VIDEO_IDS]:
             self.video_ids = [video_id
                               for video_id in arguments[Arguments.Name.VIDEO_IDS].split(',')]
-
-
-
diff --git a/pytchat/cli/progressbar.py b/pytchat/cli/progressbar.py
@@ -0,0 +1,41 @@
+'''
+The code for this progress bar is based on
+vladignatyev/progress.py
+https://gist.github.com/vladignatyev/06860ec2040cb497f0f3
+(MIT License)
+'''
+import sys
+
+
+class ProgressBar:
+    '''Single-line text progress bar written to stdout.
+
+    Progress is accumulated across calls and drawn as a fixed-width bar
+    followed by a percentage. Every redraw ends with a carriage return,
+    so the next redraw overwrites the same terminal line.
+
+    Parameter
+    ---------
+    duration :
+        amount that corresponds to 100%. _disp() divides the values it
+        receives by 1000 before adding them up, so this is presumably
+        seconds against milliseconds -- confirm against the caller.
+    '''
+
+    def __init__(self, duration):
+        self._duration = duration
+        self._count = 0           # progress accumulated so far
+        self._bar_len = 60        # bar width in characters
+        self._cancelled = False
+
+    def _disp(self, _, fetched):
+        '''Progress callback: add fetched / 1000 to the bar and redraw.
+
+        The first argument is ignored.
+        '''
+        self._progress(fetched / 1000, self._duration)
+
+    def _progress(self, fillin, total, status=''):
+        '''Add `fillin` to the running count and redraw the bar.
+
+        Nothing is drawn when `total` is 0 or the bar was cancelled.
+        Once the accumulated count reaches `total`, the bar is drawn
+        full and the percentage is shown as 100.
+
+        Parameter
+        ---------
+        fillin :
+            amount of progress to add.
+        total :
+            amount that corresponds to 100%.
+        status : str
+            optional text appended after the percentage.
+        '''
+        if total == 0 or self._cancelled:
+            return
+        self._count += fillin
+        ratio = self._count / float(total)
+        if ratio >= 1:
+            # Clamp on the ratio itself. Clamping only when the rounded
+            # bar length overflowed let small overshoots (for example
+            # 100.5%) through, because they still round to a full bar.
+            filled_len = self._bar_len
+            percents = 100
+        else:
+            filled_len = int(round(self._bar_len * ratio))
+            percents = round(100.0 * ratio, 1)
+        bar = '=' * filled_len + ' ' * (self._bar_len - filled_len)
+        sys.stdout.write(' [%s] %s%s ...%s\r' % (bar, percents, '%', status))
+        sys.stdout.flush()
+
+    def close(self):
+        '''Draw the bar as complete, unless it was cancelled.'''
+        if not self._cancelled:
+            # Adding a full duration pushes the count to at least 100%.
+            self._progress(self._duration, self._duration)
+
+    def cancel(self):
+        '''Mark the bar as cancelled; later updates are ignored.'''
+        self._cancelled = True
+
+    def is_cancelled(self):
+        '''Return True if cancel() has been called.'''
+        return self._cancelled
diff --git a/pytchat/config/__init__.py b/pytchat/config/__init__.py
@@ -1,7 +1,8 @@
 import logging
 from . import mylogger
 headers = {
-    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'}
+    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
+}
 
 
 def logger(module_name: str, loglevel=None):

diff --git a/pytchat/core_async/livechat.py b/pytchat/core_async/livechat.py
@@ -1,13 +1,13 @@
-import aiohttp
+
 import asyncio
+import httpx
 import json
 import signal
 import time
 import traceback
 import urllib.parse
-from aiohttp.client_exceptions import ClientConnectorError
-from concurrent.futures import CancelledError
 from asyncio import Queue
+from concurrent.futures import CancelledError
 from .buffer import Buffer
 from ..parser.live import Parser
 from .. import config
@@ -22,7 +22,7 @@
 
 
 class LiveChatAsync:
-    '''asyncio(aiohttp)を利用してYouTubeのライブ配信のチャットデータを取得する。
+    '''asyncioを利用してYouTubeのライブ配信のチャットデータを取得する。
 
     Parameter
     ---------
@@ -161,11 +161,11 @@ async def _listen(self, continuation):
             parameter for next chat data
         '''
         try:
-            async with aiohttp.ClientSession() as session:
+            async with httpx.AsyncClient(http2=True) as client:
                 while(continuation and self._is_alive):
                     continuation = await self._check_pause(continuation)
                     contents = await self._get_contents(
-                        continuation, session, headers)
+                        continuation, client, headers)
                     metadata, chatdata = self._parser.parse(contents)
 
                     timeout = metadata['timeoutMs'] / 1000
@@ -210,7 +210,7 @@ async def _check_pause(self, continuation):
                     self._video_id, 3, self._topchat_only)
         return continuation
 
-    async def _get_contents(self, continuation, session, headers):
+    async def _get_contents(self, continuation, client, headers):
         '''Get 'continuationContents' from livechat json.
            If contents is None at first fetching,
            try to fetch archive chat data.
@@ -219,7 +219,7 @@ async def _get_contents(self, continuation, session, headers):
           -------
             'continuationContents' which includes metadata & chatdata.
         '''
-        livechat_json = await self._get_livechat_json(continuation, session, headers)
+        livechat_json = await self._get_livechat_json(continuation, client, headers)
         contents = self._parser.get_contents(livechat_json)
         if self._first_fetch:
             if contents is None or self._is_replay:
@@ -229,18 +229,18 @@ async def _get_contents(self, continuation, session, headers):
                 continuation = arcparam.getparam(
                     self._video_id, self.seektime, self._topchat_only)
                 livechat_json = (await self._get_livechat_json(
-                                 continuation, session, headers))
+                                 continuation, client, headers))
                 reload_continuation = self._parser.reload_continuation(
                     self._parser.get_contents(livechat_json))
                 if reload_continuation:
                     livechat_json = (await self._get_livechat_json(
-                        reload_continuation, session, headers))
+                        reload_continuation, client, headers))
                 contents = self._parser.get_contents(livechat_json)
                 self._is_replay = True
             self._first_fetch = False
         return contents
 
-    async def _get_livechat_json(self, continuation, session, headers):
+    async def _get_livechat_json(self, continuation, client, headers):
         '''
         Get json which includes chat data.
         '''
@@ -249,14 +249,13 @@ async def _get_livechat_json(self, continuation, session, headers):
         status_code = 0
         url = f"https://www.youtube.com/{self._fetch_url}{continuation}&pbj=1"
         for _ in range(MAX_RETRY + 1):
-            async with session.get(url, headers=headers) as resp:
-                try:
-                    text = await resp.text()
-                    livechat_json = json.loads(text)
-                    break
-                except (ClientConnectorError, json.JSONDecodeError):
-                    await asyncio.sleep(1)
-                    continue
+            try:
+                resp = await client.get(url, headers=headers)
+                livechat_json = resp.json()
+                break
+            except (httpx.HTTPError, json.JSONDecodeError):
+                await asyncio.sleep(1)
+                continue
         else:
             self._logger.error(f"[{self._video_id}]"
                                f"Exceeded retry count. status_code={status_code}")

diff --git a/pytchat/core_multithread/livechat.py b/pytchat/core_multithread/livechat.py
@@ -1,4 +1,4 @@
-import requests
+import httpx
 import json
 import signal
 import time
@@ -153,10 +153,10 @@ def _listen(self, continuation):
             parameter for next chat data
         '''
         try:
-            with requests.Session() as session:
+            with httpx.Client(http2=True) as client:
                 while(continuation and self._is_alive):
                     continuation = self._check_pause(continuation)
-                    contents = self._get_contents(continuation, session, headers)
+                    contents = self._get_contents(continuation, client, headers)
                     metadata, chatdata = self._parser.parse(contents)
                     timeout = metadata['timeoutMs'] / 1000
                     chat_component = {
@@ -199,7 +199,7 @@ def _check_pause(self, continuation):
                 continuation = liveparam.getparam(self._video_id, 3)
         return continuation
 
-    def _get_contents(self, continuation, session, headers):
+    def _get_contents(self, continuation, client, headers):
         '''Get 'continuationContents' from livechat json.
            If contents is None at first fetching,
            try to fetch archive chat data.
@@ -209,7 +209,7 @@ def _get_contents(self, continuation, session, headers):
             'continuationContents' which includes metadata & chat data.
         '''
         livechat_json = (
-            self._get_livechat_json(continuation, session, headers)
+            self._get_livechat_json(continuation, client, headers)
         )
         contents = self._parser.get_contents(livechat_json)
         if self._first_fetch:
@@ -219,18 +219,18 @@ def _get_contents(self, continuation, session, headers):
                 self._fetch_url = "live_chat_replay/get_live_chat_replay?continuation="
                 continuation = arcparam.getparam(
                     self._video_id, self.seektime, self._topchat_only)
-                livechat_json = (self._get_livechat_json(continuation, session, headers))
+                livechat_json = (self._get_livechat_json(continuation, client, headers))
                 reload_continuation = self._parser.reload_continuation(
                     self._parser.get_contents(livechat_json))
                 if reload_continuation:
                     livechat_json = (self._get_livechat_json(
-                        reload_continuation, session, headers))
+                        reload_continuation, client, headers))
                 contents = self._parser.get_contents(livechat_json)
                 self._is_replay = True
             self._first_fetch = False
         return contents
 
-    def _get_livechat_json(self, continuation, session, headers):
+    def _get_livechat_json(self, continuation, client, headers):
         '''
         Get json which includes chat data.
         '''
@@ -239,10 +239,9 @@ def _get_livechat_json(self, continuation, session, headers):
         status_code = 0
         url = f"https://www.youtube.com/{self._fetch_url}{continuation}&pbj=1"
         for _ in range(MAX_RETRY + 1):
-            with session.get(url, headers=headers) as resp:
+            with client:
                 try:
-                    text = resp.text
-                    livechat_json = json.loads(text)
+                    livechat_json = client.get(url, headers=headers).json()
                     break
                 except json.JSONDecodeError:
                     time.sleep(1)

diff --git a/pytchat/exceptions.py b/pytchat/exceptions.py
@@ -64,7 +64,16 @@ class FailedExtractContinuation(ChatDataFinished):
     pass
 
 
-class VideoInfoParseException(Exception):
+class VideoInfoParseError(Exception):
     '''
     Raised when video information cannot be parsed.
     '''
+
+
+class PatternUnmatchError(VideoInfoParseError):
+    '''
+    Raised when video information cannot be parsed because the expected
+    pattern did not match.
+
+    Parameter
+    ---------
+    doc :
+        the text that failed to match; kept on the `doc` attribute so
+        the caller can save it for inspection.
+
+    Attributes
+    ----------
+    msg : str
+        fixed short message, "PatternUnmatchError".
+    doc :
+        the value passed to the constructor.
+    '''
+    def __init__(self, doc):
+        # Keep args == (doc,) so copy/pickle can rebuild the exception.
+        super().__init__(doc)
+        self.msg = "PatternUnmatchError"
+        self.doc = doc
+
+    def __str__(self):
+        # Without this, str(e) and tracebacks print the whole of `doc`
+        # (the raw document that failed to parse).
+        return self.msg