From fa36e0f29ea5541831189ab119b1d50b5702dbeb Mon Sep 17 00:00:00 2001 From: TiynGER Date: Sun, 7 Jun 2020 12:49:08 +0200 Subject: [PATCH] database: files id and chdates are stored - mysql creates database and tables to given mysql if not existent already - mysql reads last change values from db - mysql saves ch_date after downloading - run now takes care for the variables of mysql and studip --- .gitignore | 3 +- README.md | 16 +- crawler.py | 202 ----------------------- src/.gitignore | 2 + src/mysql.py | 64 +++++++ requirements.txt => src/requirements.txt | 1 + src/run.py | 47 ++++++ src/studip.py | 162 ++++++++++++++++++ 8 files changed, 286 insertions(+), 211 deletions(-) delete mode 100755 crawler.py create mode 100644 src/.gitignore create mode 100755 src/mysql.py rename requirements.txt => src/requirements.txt (66%) create mode 100755 src/run.py create mode 100755 src/studip.py diff --git a/.gitignore b/.gitignore index ac36c29..aa0d57e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1 @@ -last_dl.txt -data +database diff --git a/README.md b/README.md index ceb9565..c0820ed 100644 --- a/README.md +++ b/README.md @@ -13,28 +13,30 @@ If you run the program again it only downloads files that have changed since the - [x] Specify Stud.IP-URL - [x] Specify output directory - [x] Specify chunk size to download big files + - [x] Specify all important database variables - [x] Only download files after given date - [x] Save and read download date - [x] Possible reset of download date -- [ ] Incremental file download - - [ ] Indexing downloaded files and folders +- [x] Incremental file download + - [x] Store id and chdate of downloaded files - [ ] Logging - [x] Console log - [ ] Log file ## Installation +- create an instance of a MySQL server - `git clone https://github.com/tiyn/studip-crawler` -- `cd studip-crawler` +- `cd studip-crawler/src/` - `pip3install -r requirements` - install dependencies ## Usage -Just run the file via `python3 crawler.py [options]`. 
-Alternatively to `python3 crawler.py` you can give yourself permissions using `chmod +x crawler.py [options]` and -run it with `./crawler.py [options]`. +Just run the file via `python3 run.py [options]`. +Alternatively to `python3 run.py` you can give yourself permissions using `chmod +x run.py [options]` and +run it with `./run.py [options]`. There are several options required to work. -Run `python3 crawler.py -h` for a help menu and see which ones are important for you. +Run `python3 run.py -h` for a help menu and see which ones are important for you. ## Tested StudIP instances diff --git a/crawler.py b/crawler.py deleted file mode 100755 index ef64e9b..0000000 --- a/crawler.py +++ /dev/null @@ -1,202 +0,0 @@ -#!/bin/env python3 -import time -import os -import argparse - -from tqdm import tqdm -import requests as req -from requests.auth import HTTPBasicAuth - - -def create_dir(dir): - if not os.path.exists(dir): - print('creating folder', dir) - os.mkdir(dir) - - -def set_last_dl(time): - last_dl_file = open('last_dl.txt', 'w') - last_dl_file.write(str(time).split('.')[0]) - - -def get_last_dl(): - try: - last_dl_file = open('last_dl.txt', 'r') - return int(last_dl_file.read()) - except: - return None - -parser = argparse.ArgumentParser(description='Download Files from StudIP.') -parser.add_argument('-o', '--output', type=str, - default='./data', help='path to output directory') -parser.add_argument('-u', '--user', type=str, help='studip username', required=True) -parser.add_argument('-p', '--passw', type=str, help='studip password', required=True) -parser.add_argument('-s', '--url', type=str, help='studip url', required=True) -parser.add_argument('-c', '--chunk', type=int, default=1024 * - 1024, help='chunksize for downloading data') -parser.add_argument('-r', '--reset_dl_date', action='store_true') - -args = parser.parse_args() - -BASE_DIR = os.path.abspath(args.output) -CHUNK_SIZE = args.chunk -STUDIP_DOMAIN = args.url -USERNAME = args.user -PASSWORD = 
args.passw -USER = (USERNAME, PASSWORD) -if args.reset_dl_date: - set_last_dl(None) -LAST_DOWNLOAD = get_last_dl() - - -def get_uid(): - url = STUDIP_DOMAIN + '/api.php/user/' - rsp = req.get(url, auth=USER) - user_id = rsp.json()['user_id'] - return user_id - - -def get_curr_semester(): - url = STUDIP_DOMAIN + '/api.php/semesters/' - rsp = req.get(url, auth=USER) - curr_time = int(str(time.time()).split('.')[0]) - semesters = rsp.json()['collection'] - for sem_uri in semesters: - semester = semesters[sem_uri] - sem_begin = semester['begin'] - sem_end = semester['end'] - if sem_begin < curr_time < sem_end: - return sem_uri - return 0 - - -def get_ordered_semesters(): - url = STUDIP_DOMAIN + '/api.php/semesters/' - rsp = req.get(url, auth=USER) - semesters = rsp.json()['collection'] - order_sems = [] - for sem_uri in semesters: - order_sems.append(sem_uri) - return order_sems - - -def get_curr_courses(user_id, semester): - url = STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses' - rsp = req.get(url, auth=USER) - ord_sems = get_ordered_semesters() - courses = rsp.json()['collection'] - i = 0 - course_list = {} - for course_uri in courses: - course = courses[course_uri] - start_sem = course['start_semester'] - if start_sem != None: - start_ind = ord_sems.index(start_sem) - else: - start_ind = 100 - end_sem = course['end_semester'] - if end_sem != None: - end_ind = ord_sems.index(end_sem) - else: - end_ind = 100 - curr_ind = ord_sems.index(semester) - if start_ind <= curr_ind <= end_ind: - course_title = course['title'] - course_id = course['course_id'] - course_list[course_id] = course_title - return course_list - - -def get_top_folder(course): - url = STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder' - rsp = req.get(url, auth=USER) - top_folder = rsp.json() - tf_id = top_folder['id'] - return(tf_id) - - -def get_docs(folder): - url = STUDIP_DOMAIN + '/api.php/folder/' + folder - rsp = req.get(url, auth=USER) - docs = rsp.json()['file_refs'] - res_docs 
= [] - for doc in docs: - doc_id = doc['id'] - res_docs.append(doc_id) - return(res_docs) - - -def download(doc, time): - url1 = STUDIP_DOMAIN + '/api.php/file/' + doc - rsp1 = req.get(url1, auth=USER) - doc_name = rsp1.json()['name'] - doc_chdate = rsp1.json()['chdate'] - if time == None or time < doc_chdate: - print('downloading ', doc_name) - url2 = STUDIP_DOMAIN + '/api.php/file/' + doc + '/download' - rsp2 = req.get(url2, auth=USER, stream=True) - total_size = int(rsp2.headers.get('content-length', 0)) - progbar = tqdm(total=total_size, unit='iB', unit_scale=True) - with open(doc_name, 'wb') as doc: - for chunk in rsp2.iter_content(CHUNK_SIZE): - progbar.update(len(chunk)) - doc.write(chunk) - - -def get_subdirs(folder): - url = STUDIP_DOMAIN + '/api.php/folder/' + folder - rsp = req.get(url, auth=USER) - subdirs = rsp.json()['subfolders'] - docs = rsp.json()['file_refs'] - res_subdirs = {} - for subdir in subdirs: - sub_id = subdir['id'] - sub_name = subdir['name'] - res_subdirs[sub_id] = sub_name - return res_subdirs - - -def download_folder(folder, time): - docs = get_docs(folder) - for doc in docs: - print('found doc ', doc) - download(doc, time) - - -def download_folder_rec(folder, time, base_dir): - print('folder ', folder) - create_dir(base_dir) - download_folder(folder, time) - subdirs = get_subdirs(folder) - os.chdir(base_dir) - for subdir in subdirs: - subdir_name = subdirs[subdir].replace('/', '-') - subdir_path = os.path.join(base_dir, subdir_name) - print(subdir_path) - create_dir(subdir_path) - os.chdir(subdir_path) - download_folder_rec(subdir, time, subdir_path) - - -def download_course(course, time, base_dir): - print('course ', course) - create_dir(base_dir) - os.chdir(base_dir) - root = get_top_folder(course) - download_folder_rec(root, time, base_dir) - - -def download_curr_courses(time, base_dir): - print('Start downloading all current courses') - create_dir(base_dir) - curr_courses = get_curr_courses(get_uid(), get_curr_semester()) - 
os.chdir(base_dir) - for course in curr_courses: - print('course is ', curr_courses[course]) - course_name = curr_courses[course].replace('/', '-') - path = os.path.join(base_dir, course_name) - download_course(course, time, path) - - -download_curr_courses(LAST_DOWNLOAD, BASE_DIR) -set_last_dl(time.time()) diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..a00feb9 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +data diff --git a/src/mysql.py b/src/mysql.py new file mode 100755 index 0000000..4a2c5ab --- /dev/null +++ b/src/mysql.py @@ -0,0 +1,64 @@ +#!/bin/env python3 +import time + +import pymysql + + +class Database: + + def __init__(self): + self.HOST = None + self.PORT = None + self.DB_NAME = None + self.USER = None + self.PASSW = None + self.TABLE_FILE = None + self.TABLE_FILE = 'files' + self.RESET_DL_DATE = False + + def connect(self): + return pymysql.connect( + host=self.HOST, + port=self.PORT, + user=self.USER, + password=self.PASSW, + charset='utf8mb4', + cursorclass=pymysql.cursors.DictCursor + ) + + def setup_db(self): + db = self.connect() + crs = db.cursor() + sql_query = "CREATE DATABASE IF NOT EXISTS " + self.DB_NAME + crs.execute(sql_query) + db.select_db(self.DB_NAME) + query = "CREATE TABLE IF NOT EXISTS " + self.TABLE_FILE + \ + "(id CHAR(32) NOT NULL," + \ + "ch_date INT(11) NOT NULL," + \ + "PRIMARY KEY(id))" + crs.execute(query) + print(db) + + def set_last_file_dl(self, file_id, time): + db = self.connect() + db.select_db(self.DB_NAME) + crs = db.cursor() + print('file: ', file_id, ' time: ', time) + query = "INSERT INTO " + self.TABLE_FILE + "(`id`,`ch_date`)" + \ + "VALUES ('" + file_id + "','" + time + "')" + \ + "ON DUPLICATE KEY UPDATE `ch_date` = '" + time + "'" + crs.execute(query) + db.commit() + + def get_last_file_dl(self, file_id): + if self.RESET_DL_DATE: + return None + db = self.connect() + db.select_db(self.DB_NAME) + crs = db.cursor() + query = "SELECT ch_date FROM files WHERE 
id ='" + file_id + "'" + crs.execute(query) + res = crs.fetchone() + if res != None: + return res['ch_date'] + return None diff --git a/requirements.txt b/src/requirements.txt similarity index 66% rename from requirements.txt rename to src/requirements.txt index 15918fe..23012f7 100644 --- a/requirements.txt +++ b/src/requirements.txt @@ -1,2 +1,3 @@ tqdm==4.46.1 requests==2.23.0 +PyMySQL==0.9.3 diff --git a/src/run.py b/src/run.py new file mode 100755 index 0000000..4a35c25 --- /dev/null +++ b/src/run.py @@ -0,0 +1,47 @@ +#!/bin/env python3 +import os +import argparse + +import studip +import mysql + + +parser = argparse.ArgumentParser(description='Download Files from StudIP.') +parser.add_argument('-o', '--output', type=str, + default='./data', help='path to output directory') +parser.add_argument('-u', '--user', type=str, + help='studip username', required=True) +parser.add_argument('-p', '--passwd', type=str, + help='studip password', required=True) +parser.add_argument('-s', '--url', type=str, help='studip url', required=True) +parser.add_argument('--chunk', type=int, default=1024 * + 1024, help='chunksize for downloading data') +parser.add_argument('-r', '--reset_dl_date', action='store_true', help='downloads everything and ignores last download date') +parser.add_argument('--host', type=str, default='localhost', help='mysql host') +parser.add_argument('--port', type=int, default=3306, help='mysql port') +parser.add_argument('--db_name', type=str, default='studip', help='mysql database name') +parser.add_argument('--db_user', type=str, default='root', help='mysql database user') +parser.add_argument('--db_passwd', type=str, default='secret-pw', help='mysql database password') +args = parser.parse_args() + +BASE_DIR = os.path.abspath(args.output) +USERNAME = args.user +PASSWORD = args.passwd + +db = mysql.Database() + +db.HOST = args.host +db.PORT = args.port +db.DB_NAME = args.db_name +db.USER = args.db_user +db.PASSW = args.db_passwd +db.RESET_DL_DATE = 
args.reset_dl_date +db.setup_db() + +crwlr = studip.Crawler(db) + +crwlr.CHUNK_SIZE = args.chunk +crwlr.STUDIP_DOMAIN = args.url +crwlr.USER = (USERNAME, PASSWORD) + +crwlr.download_curr_courses(BASE_DIR) diff --git a/src/studip.py b/src/studip.py new file mode 100755 index 0000000..dc27e55 --- /dev/null +++ b/src/studip.py @@ -0,0 +1,162 @@ +#!/bin/env python3 +import time +import os +import argparse + +from tqdm import tqdm +import requests as req +from requests.auth import HTTPBasicAuth + + +class Crawler: + + def __init__(self, db): + self.CHUNK_SIZE = None + self.STUDIP_DOMAIN = None + self.USER = None + self.db = db + + def create_dir(self, dir): + if not os.path.exists(dir): + print('creating folder', dir) + os.mkdir(dir) + + def get_uid(self): + url = self.STUDIP_DOMAIN + '/api.php/user/' + rsp = req.get(url, auth=self.USER) + user_id = rsp.json()['user_id'] + return user_id + + def get_curr_semester(self): + url = self.STUDIP_DOMAIN + '/api.php/semesters/' + rsp = req.get(url, auth=self.USER) + curr_time = int(str(int(time.time()))) + semesters = rsp.json()['collection'] + for sem_uri in semesters: + semester = semesters[sem_uri] + sem_begin = semester['begin'] + sem_end = semester['end'] + if sem_begin < curr_time < sem_end: + return sem_uri + return 0 + + def get_ordered_semesters(self): + url = self.STUDIP_DOMAIN + '/api.php/semesters/' + rsp = req.get(url, auth=self.USER) + semesters = rsp.json()['collection'] + order_sems = [] + for sem_uri in semesters: + order_sems.append(sem_uri) + return order_sems + + def get_curr_courses(self, user_id, semester): + url = self.STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses' + rsp = req.get(url, auth=self.USER) + ord_sems = self.get_ordered_semesters() + courses = rsp.json()['collection'] + i = 0 + course_list = {} + for course_uri in courses: + course = courses[course_uri] + start_sem = course['start_semester'] + if start_sem != None: + start_ind = ord_sems.index(start_sem) + else: + start_ind = 100 + 
end_sem = course['end_semester'] + if end_sem != None: + end_ind = ord_sems.index(end_sem) + else: + end_ind = 100 + curr_ind = ord_sems.index(semester) + if start_ind <= curr_ind <= end_ind: + course_title = course['title'] + course_id = course['course_id'] + course_list[course_id] = course_title + return course_list + + def get_top_folder(self, course): + url = self.STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder' + rsp = req.get(url, auth=self.USER) + top_folder = rsp.json() + tf_id = top_folder['id'] + return(tf_id) + + def get_docs(self, folder): + url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder + rsp = req.get(url, auth=self.USER) + docs = rsp.json()['file_refs'] + res_docs = [] + for doc in docs: + doc_id = doc['id'] + res_docs.append(doc_id) + return(res_docs) + + def download(self, doc): + url1 = self.STUDIP_DOMAIN + '/api.php/file/' + doc + rsp1 = req.get(url1, auth=self.USER) + doc_name = rsp1.json()['name'] + doc_chdate = rsp1.json()['chdate'] + last_dl = self.db.get_last_file_dl(doc) + if last_dl == None or last_dl < doc_chdate: + print('downloading ', doc_name) + url2 = self.STUDIP_DOMAIN + '/api.php/file/' + doc + '/download' + rsp2 = req.get(url2, auth=self.USER, stream=True) + total_size = int(rsp2.headers.get('content-length', 0)) + progbar = tqdm(total=total_size, unit='iB', unit_scale=True) + with open(doc_name, 'wb') as doc_file: + for chunk in rsp2.iter_content(self.CHUNK_SIZE): + progbar.update(len(chunk)) + doc_file.write(chunk) + self.db.set_last_file_dl(str(doc), str(int(time.time()))) + + def get_subdirs(self, folder): + url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder + rsp = req.get(url, auth=self.USER) + subdirs = rsp.json()['subfolders'] + docs = rsp.json()['file_refs'] + res_subdirs = {} + for subdir in subdirs: + sub_id = subdir['id'] + sub_name = subdir['name'] + res_subdirs[sub_id] = sub_name + return res_subdirs + + def download_folder(self, folder): + docs = self.get_docs(folder) + for doc in docs: + 
print('found doc ', doc) + self.download(doc) + + def download_folder_rec(self, folder, base_dir): + print('folder ', folder) + self.create_dir(base_dir) + self.download_folder(folder) + subdirs = self.get_subdirs(folder) + os.chdir(base_dir) + for subdir in subdirs: + subdir_name = subdirs[subdir].replace('/', '-') + subdir_path = os.path.join(base_dir, subdir_name) + print(subdir_path) + self.create_dir(subdir_path) + os.chdir(subdir_path) + self.download_folder_rec(subdir, subdir_path) + + def download_course(self, course, base_dir): + print('course ', course) + self.create_dir(base_dir) + os.chdir(base_dir) + root = self.get_top_folder(course) + self.download_folder_rec(root, base_dir) + + def download_curr_courses(self, base_dir): + print('Start downloading all current courses') + self.create_dir(base_dir) + curr_courses = self.get_curr_courses( + self.get_uid(), self.get_curr_semester()) + os.chdir(base_dir) + for course in curr_courses: + print('course is ', curr_courses[course]) + course_name = curr_courses[course].replace('/', '-') + path = os.path.join(base_dir, course_name) + self.download_course(course, path)