From 7ef3f063d29e62573303b36e77bc2f1c0fb782cc Mon Sep 17 00:00:00 2001
From: TiynGER
Date: Wed, 10 Jun 2020 00:24:01 +0200
Subject: [PATCH] log: add basic logging with levels and a log file

The log supports several levels that can be set from the command line.
The log file is hardcoded as log.txt and can be toggled on or off.
---
 README.md      |  5 +--
 src/.gitignore |  1 +
 src/crawler.py | 54 ++++++++++++++++++++++++++++++
 src/mysql.py   | 34 +++++++++----------
 src/run.py     | 60 ++++++++++++++++++++++------------
 src/studip.py  | 89 +++++++++++---------------------------------------
 6 files changed, 134 insertions(+), 109 deletions(-)
 create mode 100644 src/crawler.py

diff --git a/README.md b/README.md
index c0820ed..b02d348 100644
--- a/README.md
+++ b/README.md
@@ -19,9 +19,10 @@ If you run the program again
 it only downloads files that have changed since the
 - [x] Possible reset of download date
 - [x] Incremental file download
 - [x] Store id and chdate of downloaded files
-- [ ] Logging
+- [x] Logging
   - [x] Console log
-  - [ ] Log file
+  - [x] Log file
+  - [x] Specify log level
 
 ## Installation
diff --git a/src/.gitignore b/src/.gitignore
index a00feb9..89992eb 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -1,2 +1,3 @@
 __pycache__
 data
+log.txt
diff --git a/src/crawler.py b/src/crawler.py
new file mode 100644
index 0000000..2d87ce2
--- /dev/null
+++ b/src/crawler.py
@@ -0,0 +1,54 @@
+import os
+import logging as log
+
+from studip import Studip
+
+
+class Crawler:
+
+    def __init__(self, studip):
+        self.studip = studip
+
+    def download_folder(self, folder):
+        docs = self.studip.get_docs(folder)
+        for doc in docs:
+            log.info('found doc ' + doc)
+            self.studip.download(doc)
+
+    def download_folder_rec(self, folder, base_dir):
+        log.info('crawling folder ' + folder)
+        self.create_dir(base_dir)
+        self.download_folder(folder)
+        subdirs = self.studip.get_subdirs(folder)
+        os.chdir(base_dir)
+        for subdir in subdirs:
+            subdir_name = subdirs[subdir].replace('/', '-')
+            subdir_path = os.path.join(base_dir, subdir_name)
+            log.debug(subdir_path)
+            self.create_dir(subdir_path)
+            os.chdir(subdir_path)
+            self.download_folder_rec(subdir, subdir_path)
+
+    def download_course(self, course, base_dir):
+        log.info('crawling course ' + course)
+        self.create_dir(base_dir)
+        os.chdir(base_dir)
+        root = self.studip.get_top_folder(course)
+        self.download_folder_rec(root, base_dir)
+
+    def download_curr_courses(self, base_dir):
+        log.info('Start crawling all current courses')
+        self.create_dir(base_dir)
+        curr_courses = self.studip.get_curr_courses(
+            self.studip.get_uid(), self.studip.get_curr_semester())
+        os.chdir(base_dir)
+        for course in curr_courses:
+            log.debug('course is ' + curr_courses[course])
+            course_name = curr_courses[course].replace('/', '-')
+            path = os.path.join(base_dir, course_name)
+            self.download_course(course, path)
+
+    def create_dir(self, dir):
+        if not os.path.exists(dir):
+            log.info('creating folder ' + dir)
+            os.mkdir(dir)
diff --git a/src/mysql.py b/src/mysql.py
index 4a2c5ab..b5b64c0 100755
--- a/src/mysql.py
+++ b/src/mysql.py
@@ -1,27 +1,27 @@
-#!/bin/env python3
 import time
+import logging as log
 
 import pymysql
 
 
 class Database:
 
-    def __init__(self):
-        self.HOST = None
-        self.PORT = None
-        self.DB_NAME = None
-        self.USER = None
-        self.PASSW = None
-        self.TABLE_FILE = None
+    def __init__(self, host, port, name, user, passwd, reset_dl):
+        self.HOST = host
+        self.PORT = port
+        self.NAME = name
+        self.USER = user
+        self.PASSWD = passwd
+        self.RESET_DL = reset_dl
         self.TABLE_FILE = 'files'
-        self.RESET_DL_DATE = False
+        self.setup_db()
 
     def connect(self):
         return pymysql.connect(
             host=self.HOST,
             port=self.PORT,
             user=self.USER,
-            password=self.PASSW,
+            password=self.PASSWD,
             charset='utf8mb4',
             cursorclass=pymysql.cursors.DictCursor
         )
@@ -29,21 +29,21 @@ class Database:
     def setup_db(self):
         db = self.connect()
         crs = db.cursor()
-        sql_query = "CREATE DATABASE IF NOT EXISTS " + self.DB_NAME
+        sql_query = "CREATE DATABASE IF NOT EXISTS " + self.NAME
         crs.execute(sql_query)
-        db.select_db(self.DB_NAME)
+        db.select_db(self.NAME)
         query = "CREATE TABLE IF NOT EXISTS " + self.TABLE_FILE + \
             "(id CHAR(32) NOT NULL," + \
             "ch_date INT(11) NOT NULL," + \
             "PRIMARY KEY(id))"
         crs.execute(query)
-        print(db)
+        log.debug(db)
 
     def set_last_file_dl(self, file_id, time):
         db = self.connect()
-        db.select_db(self.DB_NAME)
+        db.select_db(self.NAME)
         crs = db.cursor()
-        print('file: ', file_id, ' time: ', time)
+        log.debug('file: ' + file_id + ' time: ' + time)
         query = "INSERT INTO " + self.TABLE_FILE + "(`id`,`ch_date`)" + \
             "VALUES ('" + file_id + "','" + time + "')" + \
             "ON DUPLICATE KEY UPDATE `ch_date` = '" + time + "'"
@@ -51,10 +51,10 @@ class Database:
         db.commit()
 
     def get_last_file_dl(self, file_id):
-        if self.RESET_DL_DATE:
+        if self.RESET_DL:
             return None
         db = self.connect()
-        db.select_db(self.DB_NAME)
+        db.select_db(self.NAME)
         crs = db.cursor()
         query = "SELECT ch_date FROM files WHERE id ='" + file_id + "'"
         crs.execute(query)
diff --git a/src/run.py b/src/run.py
index 4a35c25..a706df9 100755
--- a/src/run.py
+++ b/src/run.py
@@ -1,9 +1,12 @@
 #!/bin/env python3
 import os
+import sys
 import argparse
+import logging as log
 
-import studip
-import mysql
+from studip import Studip
+from crawler import Crawler
+from mysql import Database
 
 
 parser = argparse.ArgumentParser(description='Download Files from StudIP.')
@@ -16,32 +19,49 @@ parser.add_argument('-p', '--passwd', type=str,
 parser.add_argument('-s', '--url', type=str, help='studip url', required=True)
 parser.add_argument('--chunk', type=int, default=1024 * 1024,
                     help='chunksize for downloading data')
-parser.add_argument('-r', '--reset_dl_date', action='store_true', help='downloads everything and ignores last download date')
+parser.add_argument('-r', '--reset_dl_date', action='store_true',
+                    help='downloads everything and ignores last download date')
 parser.add_argument('--host', type=str, default='localhost', help='mysql host')
 parser.add_argument('--port', type=int, default=3306, help='mysql port')
-parser.add_argument('--db_name', type=str, default='studip', help='mysql database name')
-parser.add_argument('--db_user', type=str, default='root', help='mysql database user')
-parser.add_argument('--db_passwd', type=str, default='secret-pw', help='mysql database password')
+parser.add_argument('--db_name', type=str, default='studip',
+                    help='mysql database name')
+parser.add_argument('--db_user', type=str, default='root',
+                    help='mysql database user')
+parser.add_argument('--db_passwd', type=str,
+                    default='secret-pw', help='mysql database password')
+parser.add_argument('-d', '--debug_output', action='store_true',
+                    help='display debug information about the process')
+parser.add_argument('-q', '--quiet', action='store_true',
+                    help='only display most important output')
+parser.add_argument('-l', '--log_file', action='store_true',
+                    help='saves log to a log file named "log.txt"')
 
 args = parser.parse_args()
 
+if args.quiet:
+    log_level = log.WARNING
+elif args.debug_output:
+    log_level = log.DEBUG
+else:
+    log_level = log.INFO
+
+if args.log_file:
+    log.basicConfig(level=log_level, filename='log.txt')
+else:
+    log.basicConfig(level=log_level)
+
 BASE_DIR = os.path.abspath(args.output)
 USERNAME = args.user
 PASSWORD = args.passwd
-db = mysql.Database()
-
-db.HOST = args.host
-db.PORT = args.port
-db.DB_NAME = args.db_name
-db.USER = args.db_user
-db.PASSW = args.db_passwd
-db.RESET_DL_DATE = args.reset_dl_date
-db.setup_db()
+db = Database(args.host, args.port, args.db_name,
+              args.db_user, args.db_passwd, args.reset_dl_date)
 
-crwlr = studip.Crawler(db)
+studip = Studip(args.chunk, args.url, (USERNAME, PASSWORD), db)
 
-crwlr.CHUNK_SIZE = args.chunk
-crwlr.STUDIP_DOMAIN = args.url
-crwlr.USER = (USERNAME, PASSWORD)
+crawler = Crawler(studip)
 
-crwlr.download_curr_courses(BASE_DIR)
+# Start crawling
+try:
+    crawler.download_curr_courses(BASE_DIR)
+except KeyboardInterrupt:
+    sys.exit(0)
diff --git a/src/studip.py b/src/studip.py
index dc27e55..99da36a 100755
--- a/src/studip.py
+++ b/src/studip.py
@@ -1,35 +1,30 @@
-#!/bin/env python3
 import time
-import os
-import argparse
+import logging as log
 
 from tqdm import tqdm
 import requests as req
 from requests.auth import HTTPBasicAuth
 
 
-class Crawler:
+class Studip:
 
-    def __init__(self, db):
-        self.CHUNK_SIZE = None
-        self.STUDIP_DOMAIN = None
-        self.USER = None
+    def __init__(self, chunk_size, domain, user, db):
+        self.CHUNK_SIZE = chunk_size
+        self.DOMAIN = domain
+        self.USER = user
         self.db = db
 
-    def create_dir(self, dir):
-        if not os.path.exists(dir):
-            print('creating folder', dir)
-            os.mkdir(dir)
+    def auth_req(self, url):
+        url = self.DOMAIN + url
+        return req.get(url, auth=self.USER)
 
     def get_uid(self):
-        url = self.STUDIP_DOMAIN + '/api.php/user/'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/user/')
         user_id = rsp.json()['user_id']
         return user_id
 
     def get_curr_semester(self):
-        url = self.STUDIP_DOMAIN + '/api.php/semesters/'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/semesters/')
         curr_time = int(str(int(time.time())))
         semesters = rsp.json()['collection']
         for sem_uri in semesters:
@@ -41,8 +36,7 @@ class Crawler:
         return 0
 
     def get_ordered_semesters(self):
-        url = self.STUDIP_DOMAIN + '/api.php/semesters/'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/semesters/')
         semesters = rsp.json()['collection']
         order_sems = []
         for sem_uri in semesters:
@@ -50,8 +44,7 @@ class Crawler:
         return order_sems
 
     def get_curr_courses(self, user_id, semester):
-        url = self.STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/user/' + user_id + '/courses')
         ord_sems = self.get_ordered_semesters()
         courses = rsp.json()['collection']
         i = 0
@@ -76,15 +69,13 @@ class Crawler:
         return course_list
 
     def get_top_folder(self, course):
-        url = self.STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/course/' + course + '/top_folder')
         top_folder = rsp.json()
         tf_id = top_folder['id']
         return(tf_id)
 
     def get_docs(self, folder):
-        url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/folder/' + folder)
         docs = rsp.json()['file_refs']
         res_docs = []
         for doc in docs:
@@ -93,16 +84,14 @@ class Crawler:
         return(res_docs)
 
     def download(self, doc):
-        url1 = self.STUDIP_DOMAIN + '/api.php/file/' + doc
-        rsp1 = req.get(url1, auth=self.USER)
+        rsp1 = self.auth_req('/api.php/file/' + doc)
         doc_name = rsp1.json()['name']
         doc_chdate = rsp1.json()['chdate']
         last_dl = self.db.get_last_file_dl(doc)
         if last_dl == None or last_dl < doc_chdate:
-            print('downloading ', doc_name)
-            url2 = self.STUDIP_DOMAIN + '/api.php/file/' + doc + '/download'
-            rsp2 = req.get(url2, auth=self.USER, stream=True)
+            rsp2 = self.auth_req('/api.php/file/' + doc + '/download')
             total_size = int(rsp2.headers.get('content-length', 0))
+            print('downloading ' + doc_name)
             progbar = tqdm(total=total_size, unit='iB', unit_scale=True)
             with open(doc_name, 'wb') as doc_file:
                 for chunk in rsp2.iter_content(self.CHUNK_SIZE):
@@ -111,8 +100,7 @@ class Crawler:
         self.db.set_last_file_dl(str(doc), str(int(time.time())))
 
     def get_subdirs(self, folder):
-        url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/folder/' + folder)
         subdirs = rsp.json()['subfolders']
         docs = rsp.json()['file_refs']
         res_subdirs = {}
@@ -121,42 +109,3 @@ class Crawler:
             sub_name = subdir['name']
             res_subdirs[sub_id] = sub_name
         return res_subdirs
-
-    def download_folder(self, folder):
-        docs = self.get_docs(folder)
-        for doc in docs:
-            print('found doc ', doc)
-            self.download(doc)
-
-    def download_folder_rec(self, folder, base_dir):
-        print('folder ', folder)
-        self.create_dir(base_dir)
-        self.download_folder(folder)
-        subdirs = self.get_subdirs(folder)
-        os.chdir(base_dir)
-        for subdir in subdirs:
-            subdir_name = subdirs[subdir].replace('/', '-')
-            subdir_path = os.path.join(base_dir, subdir_name)
-            print(subdir_path)
-            self.create_dir(subdir_path)
-            os.chdir(subdir_path)
-            self.download_folder_rec(subdir, subdir_path)
-
-    def download_course(self, course, base_dir):
-        print('course ', course)
-        self.create_dir(base_dir)
-        os.chdir(base_dir)
-        root = self.get_top_folder(course)
-        self.download_folder_rec(root, base_dir)
-
-    def download_curr_courses(self, base_dir):
-        print('Start downloading all current courses')
-        self.create_dir(base_dir)
-        curr_courses = self.get_curr_courses(
-            self.get_uid(), self.get_curr_semester())
-        os.chdir(base_dir)
-        for course in curr_courses:
-            print('course is ', curr_courses[course])
-            course_name = curr_courses[course].replace('/', '-')
-            path = os.path.join(base_dir, course_name)
-            self.download_course(course, path)
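
Illustrative usage of the new logging flags (not part of the commit itself): -d, -q, -l, -p/--passwd and -s/--url are defined in run.py above; the --user spelling is assumed from args.user, and the credentials and URL below are placeholders.

    python run.py --user alice -p secret -s https://studip.example.org -d -l    # debug output, also written to log.txt
    python run.py --user alice -p secret -s https://studip.example.org -q       # warnings and errors only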