diff --git a/README.md b/README.md
index c0820ed..b02d348 100644
--- a/README.md
+++ b/README.md
@@ -19,9 +19,10 @@ If you run the program again it only downloads files that have changed since the
 - [x] Possible reset of download date
 - [x] Incremental file download
 - [x] Store id and chdate of downloaded files
-- [ ] Logging
+- [x] Logging
   - [x] Console log
-  - [ ] Log file
+  - [x] Log file
+  - [x] Specify log level
 
 ## Installation
 
diff --git a/src/.gitignore b/src/.gitignore
index a00feb9..89992eb 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -1,2 +1,3 @@
 __pycache__
 data
+log.txt
diff --git a/src/crawler.py b/src/crawler.py
new file mode 100644
index 0000000..2d87ce2
--- /dev/null
+++ b/src/crawler.py
@@ -0,0 +1,54 @@
+import os
+import logging as log
+
+from studip import Studip
+
+
+class Crawler:
+
+    def __init__(self, studip):
+        self.studip = studip
+
+    def download_folder(self, folder):
+        docs = self.studip.get_docs(folder)
+        for doc in docs:
+            log.info('found doc ' + doc)
+            self.studip.download(doc)
+
+    def download_folder_rec(self, folder, base_dir):
+        log.info('crawling folder ' + folder)
+        self.create_dir(base_dir)
+        self.download_folder(folder)
+        subdirs = self.studip.get_subdirs(folder)
+        os.chdir(base_dir)
+        for subdir in subdirs:
+            subdir_name = subdirs[subdir].replace('/', '-')
+            subdir_path = os.path.join(base_dir, subdir_name)
+            log.debug(subdir_path)
+            self.create_dir(subdir_path)
+            os.chdir(subdir_path)
+            self.download_folder_rec(subdir, subdir_path)
+
+    def download_course(self, course, base_dir):
+        log.info('crawling course ' + course)
+        self.create_dir(base_dir)
+        os.chdir(base_dir)
+        root = self.studip.get_top_folder(course)
+        self.download_folder_rec(root, base_dir)
+
+    def download_curr_courses(self, base_dir):
+        log.info('Start crawling all current courses')
+        self.create_dir(base_dir)
+        curr_courses = self.studip.get_curr_courses(
+            self.studip.get_uid(), self.studip.get_curr_semester())
+        os.chdir(base_dir)
+        for course in curr_courses:
+            log.debug('course is ' + curr_courses[course])
+            course_name = curr_courses[course].replace('/', '-')
+            path = os.path.join(base_dir, course_name)
+            self.download_course(course, path)
+
+    def create_dir(self, dir):
+        if not os.path.exists(dir):
+            log.info('creating folder ' + dir)
+            os.mkdir(dir)
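
The new `Crawler` only talks to whatever object it is handed, so it can be exercised without a server. A minimal sketch, assuming a hypothetical stub in place of the real `Studip` (the stub's return values are made up; only the method names come from this diff):

```python
import logging as log

from crawler import Crawler

log.basicConfig(level=log.INFO)


class StubStudip:
    """Hypothetical stand-in for studip.Studip with canned API answers."""

    def get_uid(self):
        return 'user-1'

    def get_curr_semester(self):
        return 'sem-1'

    def get_curr_courses(self, uid, semester):
        return {'course-1': 'Algorithms'}

    def get_top_folder(self, course):
        return 'folder-root'

    def get_docs(self, folder):
        return []   # nothing to download

    def get_subdirs(self, folder):
        return {}   # no subfolders, so the recursion stops here

    def download(self, doc):
        pass


Crawler(StubStudip()).download_curr_courses('/tmp/studip-test')
```
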
diff --git a/src/mysql.py b/src/mysql.py
index 4a2c5ab..b5b64c0 100755
--- a/src/mysql.py
+++ b/src/mysql.py
@@ -1,27 +1,27 @@
-#!/bin/env python3
 import time
+import logging as log
 
 import pymysql
 
 
 class Database:
 
-    def __init__(self):
-        self.HOST = None
-        self.PORT = None
-        self.DB_NAME = None
-        self.USER = None
-        self.PASSW = None
-        self.TABLE_FILE = None
+    def __init__(self, host, port, name, user, passwd, reset_dl):
+        self.HOST = host
+        self.PORT = port
+        self.NAME = name
+        self.USER = user
+        self.PASSWD = passwd
+        self.RESET_DL = reset_dl
         self.TABLE_FILE = 'files'
-        self.RESET_DL_DATE = False
+        self.setup_db()
 
     def connect(self):
         return pymysql.connect(
             host=self.HOST,
             port=self.PORT,
             user=self.USER,
-            password=self.PASSW,
+            password=self.PASSWD,
             charset='utf8mb4',
             cursorclass=pymysql.cursors.DictCursor
         )
@@ -29,21 +29,21 @@ class Database:
     def setup_db(self):
         db = self.connect()
         crs = db.cursor()
-        sql_query = "CREATE DATABASE IF NOT EXISTS " + self.DB_NAME
+        sql_query = "CREATE DATABASE IF NOT EXISTS " + self.NAME
        crs.execute(sql_query)
-        db.select_db(self.DB_NAME)
+        db.select_db(self.NAME)
         query = "CREATE TABLE IF NOT EXISTS " + self.TABLE_FILE + \
                 "(id CHAR(32) NOT NULL," + \
                 "ch_date INT(11) NOT NULL," + \
                 "PRIMARY KEY(id))"
         crs.execute(query)
-        print(db)
+        log.debug(db)
 
     def set_last_file_dl(self, file_id, time):
         db = self.connect()
-        db.select_db(self.DB_NAME)
+        db.select_db(self.NAME)
         crs = db.cursor()
-        print('file: ', file_id, ' time: ', time)
+        log.debug('file: ' + file_id + ' time: ' + time)
         query = "INSERT INTO " + self.TABLE_FILE + "(`id`,`ch_date`)" + \
                 "VALUES ('" + file_id + "','" + time + "')" + \
                 "ON DUPLICATE KEY UPDATE `ch_date` = '" + time + "'"
@@ -51,10 +51,10 @@ class Database:
         db.commit()
 
     def get_last_file_dl(self, file_id):
-        if self.RESET_DL_DATE:
+        if self.RESET_DL:
             return None
         db = self.connect()
-        db.select_db(self.DB_NAME)
+        db.select_db(self.NAME)
         crs = db.cursor()
         query = "SELECT ch_date FROM files WHERE id ='" + file_id + "'"
         crs.execute(query)
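
For reference, a sketch of how the reworked `Database` is now constructed and used (connection values are placeholders and assume a reachable MySQL server; the constructor runs `setup_db()` itself):

```python
from mysql import Database

# Placeholder credentials; these mirror run.py's argparse defaults.
db = Database('localhost', 3306, 'studip', 'root', 'secret-pw', False)

file_id = 'a' * 32                           # ids are CHAR(32) in the files table
db.set_last_file_dl(file_id, '1700000000')   # first insert
db.set_last_file_dl(file_id, '1700000100')   # same id: ch_date is updated in place
print(db.get_last_file_dl(file_id))          # the stored ch_date, or None with -r
```
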
diff --git a/src/run.py b/src/run.py
index 4a35c25..a706df9 100755
--- a/src/run.py
+++ b/src/run.py
@@ -1,9 +1,12 @@
 #!/bin/env python3
 import os
+import sys
 import argparse
+import logging as log
 
-import studip
-import mysql
+from studip import Studip
+from crawler import Crawler
+from mysql import Database
 
 
 parser = argparse.ArgumentParser(description='Download Files from StudIP.')
@@ -16,32 +19,49 @@ parser.add_argument('-p', '--passwd', type=str,
 parser.add_argument('-s', '--url', type=str, help='studip url', required=True)
 parser.add_argument('--chunk', type=int, default=1024 * 1024, help='chunksize for downloading data')
-parser.add_argument('-r', '--reset_dl_date', action='store_true', help='downloads everything and ignores last download date')
+parser.add_argument('-r', '--reset_dl_date', action='store_true',
+                    help='downloads everything and ignores last download date')
 parser.add_argument('--host', type=str, default='localhost', help='mysql host')
 parser.add_argument('--port', type=int, default=3306, help='mysql port')
-parser.add_argument('--db_name', type=str, default='studip', help='mysql database name')
-parser.add_argument('--db_user', type=str, default='root', help='mysql database user')
-parser.add_argument('--db_passwd', type=str, default='secret-pw', help='mysql database password')
+parser.add_argument('--db_name', type=str, default='studip',
+                    help='mysql database name')
+parser.add_argument('--db_user', type=str, default='root',
+                    help='mysql database user')
+parser.add_argument('--db_passwd', type=str,
+                    default='secret-pw', help='mysql database password')
+parser.add_argument('-d', '--debug_output', action='store_true',
+                    help='display debug information about the process')
+parser.add_argument('-q', '--quiet', action='store_true',
+                    help='only display most important output')
+parser.add_argument('-l', '--log_file', action='store_true',
+                    help='saves log to a log file named "log.txt"')
 
 args = parser.parse_args()
 
+if args.quiet:
+    log_level = log.WARNING
+elif args.debug_output:
+    log_level = log.DEBUG
+else:
+    log_level = log.INFO
+
+if args.log_file:
+    log.basicConfig(level=log_level, filename='log.txt')
+else:
+    log.basicConfig(level=log_level)
+
 BASE_DIR = os.path.abspath(args.output)
 USERNAME = args.user
 PASSWORD = args.passwd
 
-db = mysql.Database()
-
-db.HOST = args.host
-db.PORT = args.port
-db.DB_NAME = args.db_name
-db.USER = args.db_user
-db.PASSW = args.db_passwd
-db.RESET_DL_DATE = args.reset_dl_date
-db.setup_db()
+db = Database(args.host, args.port, args.db_name,
+              args.db_user, args.db_passwd, args.reset_dl_date)
 
-crwlr = studip.Crawler(db)
+studip = Studip(args.chunk, args.url, (USERNAME, PASSWORD), db)
 
-crwlr.CHUNK_SIZE = args.chunk
-crwlr.STUDIP_DOMAIN = args.url
-crwlr.USER = (USERNAME, PASSWORD)
+crawler = Crawler(studip)
 
-crwlr.download_curr_courses(BASE_DIR)
+# Start crawling
+try:
+    crawler.download_curr_courses(BASE_DIR)
+except KeyboardInterrupt:
+    sys.exit(0)
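
The three new flags map to `logging` levels, with `--quiet` taking precedence over `--debug_output`. A condensed sketch of that selection logic and of what `-l` does:

```python
import logging as log

def pick_level(quiet, debug):
    # Mirrors run.py: -q wins, then -d, otherwise INFO.
    if quiet:
        return log.WARNING
    elif debug:
        return log.DEBUG
    return log.INFO

assert pick_level(True, True) == log.WARNING   # -q beats -d

# With -l the same records go to log.txt instead of the console:
log.basicConfig(level=pick_level(False, True), filename='log.txt')
log.debug('this line ends up in log.txt')
```
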
diff --git a/src/studip.py b/src/studip.py
index dc27e55..99da36a 100755
--- a/src/studip.py
+++ b/src/studip.py
@@ -1,35 +1,30 @@
-#!/bin/env python3
 import time
-import os
-import argparse
+import logging as log
 
 from tqdm import tqdm
 import requests as req
 from requests.auth import HTTPBasicAuth
 
 
-class Crawler:
+class Studip:
 
-    def __init__(self, db):
-        self.CHUNK_SIZE = None
-        self.STUDIP_DOMAIN = None
-        self.USER = None
+    def __init__(self, chunk_size, domain, user, db):
+        self.CHUNK_SIZE = chunk_size
+        self.DOMAIN = domain
+        self.USER = user
         self.db = db
 
-    def create_dir(self, dir):
-        if not os.path.exists(dir):
-            print('creating folder', dir)
-            os.mkdir(dir)
+    def auth_req(self, url, stream=False):
+        url = self.DOMAIN + url
+        return req.get(url, auth=self.USER, stream=stream)
 
     def get_uid(self):
-        url = self.STUDIP_DOMAIN + '/api.php/user/'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/user/')
         user_id = rsp.json()['user_id']
         return user_id
 
     def get_curr_semester(self):
-        url = self.STUDIP_DOMAIN + '/api.php/semesters/'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/semesters/')
         curr_time = int(str(int(time.time())))
         semesters = rsp.json()['collection']
         for sem_uri in semesters:
@@ -41,8 +36,7 @@ class Crawler:
         return 0
 
     def get_ordered_semesters(self):
-        url = self.STUDIP_DOMAIN + '/api.php/semesters/'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/semesters/')
         semesters = rsp.json()['collection']
         order_sems = []
         for sem_uri in semesters:
@@ -50,8 +44,7 @@ class Crawler:
         return order_sems
 
     def get_curr_courses(self, user_id, semester):
-        url = self.STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/user/' + user_id + '/courses')
         ord_sems = self.get_ordered_semesters()
         courses = rsp.json()['collection']
         i = 0
@@ -76,15 +69,13 @@ class Crawler:
         return course_list
 
     def get_top_folder(self, course):
-        url = self.STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/course/' + course + '/top_folder')
         top_folder = rsp.json()
         tf_id = top_folder['id']
         return(tf_id)
 
     def get_docs(self, folder):
-        url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/folder/' + folder)
         docs = rsp.json()['file_refs']
         res_docs = []
         for doc in docs:
@@ -93,16 +84,14 @@ class Crawler:
         return(res_docs)
 
     def download(self, doc):
-        url1 = self.STUDIP_DOMAIN + '/api.php/file/' + doc
-        rsp1 = req.get(url1, auth=self.USER)
+        rsp1 = self.auth_req('/api.php/file/' + doc)
         doc_name = rsp1.json()['name']
         doc_chdate = rsp1.json()['chdate']
         last_dl = self.db.get_last_file_dl(doc)
         if last_dl == None or last_dl < doc_chdate:
-            print('downloading ', doc_name)
-            url2 = self.STUDIP_DOMAIN + '/api.php/file/' + doc + '/download'
-            rsp2 = req.get(url2, auth=self.USER, stream=True)
+            rsp2 = self.auth_req('/api.php/file/' + doc + '/download', stream=True)
             total_size = int(rsp2.headers.get('content-length', 0))
+            print('downloading ' + doc_name)
             progbar = tqdm(total=total_size, unit='iB', unit_scale=True)
             with open(doc_name, 'wb') as doc_file:
                 for chunk in rsp2.iter_content(self.CHUNK_SIZE):
@@ -111,8 +100,7 @@ class Crawler:
             self.db.set_last_file_dl(str(doc), str(int(time.time())))
 
     def get_subdirs(self, folder):
-        url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/folder/' + folder)
         subdirs = rsp.json()['subfolders']
         docs = rsp.json()['file_refs']
         res_subdirs = {}
@@ -121,42 +109,3 @@ class Crawler:
             sub_name = subdir['name']
             res_subdirs[sub_id] = sub_name
         return res_subdirs
-
-    def download_folder(self, folder):
-        docs = self.get_docs(folder)
-        for doc in docs:
-            print('found doc ', doc)
-            self.download(doc)
-
-    def download_folder_rec(self, folder, base_dir):
-        print('folder ', folder)
-        self.create_dir(base_dir)
-        self.download_folder(folder)
-        subdirs = self.get_subdirs(folder)
-        os.chdir(base_dir)
-        for subdir in subdirs:
-            subdir_name = subdirs[subdir].replace('/', '-')
-            subdir_path = os.path.join(base_dir, subdir_name)
-            print(subdir_path)
-            self.create_dir(subdir_path)
-            os.chdir(subdir_path)
-            self.download_folder_rec(subdir, subdir_path)
-
-    def download_course(self, course, base_dir):
-        print('course ', course)
-        self.create_dir(base_dir)
-        os.chdir(base_dir)
-        root = self.get_top_folder(course)
-        self.download_folder_rec(root, base_dir)
-
-    def download_curr_courses(self, base_dir):
-        print('Start downloading all current courses')
-        self.create_dir(base_dir)
-        curr_courses = self.get_curr_courses(
-            self.get_uid(), self.get_curr_semester())
-        os.chdir(base_dir)
-        for course in curr_courses:
-            print('course is ', curr_courses[course])
-            course_name = curr_courses[course].replace('/', '-')
-            path = os.path.join(base_dir, course_name)
-            self.download_course(course, path)
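
All request plumbing now funnels through `auth_req`, so the base URL and the HTTP Basic credentials live in one place. A usage sketch with placeholder endpoint and credentials (`db=None` works for read-only calls, since only `download()` touches the database):

```python
from studip import Studip

studip = Studip(1024 * 1024, 'https://studip.example.edu',
                ('alice', 'secret'), None)

# GET https://studip.example.edu/api.php/user/ with basic auth
rsp = studip.auth_req('/api.php/user/')
print(rsp.status_code)
print(studip.get_uid())   # same call, unpacked to the user id
```
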