From 704d4c86eba380eb1bdfcbb5015ac75ce33f31c4 Mon Sep 17 00:00:00 2001
From: TiynGER
Date: Fri, 5 Jun 2020 19:47:36 +0200
Subject: [PATCH] initial commit

---
 .gitignore       |   2 +
 README.md        |  38 +++++++++
 crawler.py       | 202 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   2 +
 4 files changed, 244 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100755 crawler.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ac36c29
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+last_dl.txt
+data
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..994e908
--- /dev/null
+++ b/README.md
@@ -0,0 +1,38 @@
+# Stud.IP Crawler
+
+This program downloads all files available to a given Stud.IP user.
+It only searches through and downloads the courses of the current semester.
+If you run the program again, it only downloads files that have changed since the last run.
+
+## Features/To-Dos
+
+- [x] Download files of the given user's active semester via the command line
+  - [x] Keep the file structure of Stud.IP
+  - [x] Specify username
+  - [x] Specify password
+  - [x] Specify Stud.IP URL
+  - [x] Specify output directory
+  - [x] Specify chunk size for downloading big files
+- [x] Only download files changed after a given date
+  - [x] Save and read the download date
+  - [x] Possible reset of the download date
+- [ ] Incremental file download
+  - [ ] Indexing
+
+## Installation
+
+- `git clone https://github.com/tiyn/studip-crawler`
+- `cd studip-crawler`
+- `pip3 install -r requirements.txt` - install dependencies
+
+## Usage
+
+Run the program via `python3 crawler.py [options]`.
+Alternatively, give the script execute permission with `chmod +x crawler.py` and
+run it as `./crawler.py [options]`.
+Several options are required for the program to work.
+Run `python3 crawler.py -h` for a help menu and to see which options are relevant for you.
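+
+For example, a typical invocation could look like this (the URL, username and
+password below are placeholders, substitute your own):
+
+- `python3 crawler.py -s https://studip.example.org -u jdoe -p secret -o ./data` - download all current courses to `./data`
+- `python3 crawler.py -s https://studip.example.org -u jdoe -p secret -r` - reset the saved download date and download everything again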
+
+## Tested StudIP instances
+
+- Carl von Ossietzky Universität Oldenburg
diff --git a/crawler.py b/crawler.py
new file mode 100755
index 0000000..ef64e9b
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+import time
+import os
+import argparse
+
+from tqdm import tqdm
+import requests as req
+
+# last_dl.txt always lives next to the script, since the crawler
+# changes the working directory while downloading
+LAST_DL_PATH = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), 'last_dl.txt')
+
+
+def create_dir(path):
+    # create the directory if it does not exist yet
+    if not os.path.exists(path):
+        print('creating folder', path)
+        os.mkdir(path)
+
+
+def set_last_dl(timestamp):
+    # save the time of the last download as a unix timestamp
+    with open(LAST_DL_PATH, 'w') as last_dl_file:
+        last_dl_file.write(str(timestamp).split('.')[0])
+
+
+def get_last_dl():
+    # return the saved timestamp, or None if there is no valid one yet
+    try:
+        with open(LAST_DL_PATH, 'r') as last_dl_file:
+            return int(last_dl_file.read())
+    except (OSError, ValueError):
+        return None
+
+
+parser = argparse.ArgumentParser(description='Download files from Stud.IP.')
+parser.add_argument('-o', '--output', type=str,
+                    default='./data', help='path to output directory')
+parser.add_argument('-u', '--user', type=str, help='studip username', required=True)
+parser.add_argument('-p', '--passw', type=str, help='studip password', required=True)
+parser.add_argument('-s', '--url', type=str, help='studip url', required=True)
+parser.add_argument('-c', '--chunk', type=int, default=1024 * 1024,
+                    help='chunk size for downloading data')
+parser.add_argument('-r', '--reset_dl_date', action='store_true',
+                    help='ignore the last download date and download everything')
+
+args = parser.parse_args()
+
+BASE_DIR = os.path.abspath(args.output)
+CHUNK_SIZE = args.chunk
+STUDIP_DOMAIN = args.url
+USERNAME = args.user
+PASSWORD = args.passw
+USER = (USERNAME, PASSWORD)
+if args.reset_dl_date:
+    set_last_dl(None)
+LAST_DOWNLOAD = get_last_dl()
+
+
+def get_uid():
+    # return the user id of the authenticated user
+    url = STUDIP_DOMAIN + '/api.php/user/'
+    rsp = req.get(url, auth=USER)
+    return rsp.json()['user_id']
+
+
+def get_curr_semester():
+    # return the URI of the semester containing the current date
+    url = STUDIP_DOMAIN + '/api.php/semesters/'
+    rsp = req.get(url, auth=USER)
+    curr_time = int(time.time())
+    semesters = rsp.json()['collection']
+    for sem_uri in semesters:
+        semester = semesters[sem_uri]
+        if semester['begin'] < curr_time < semester['end']:
+            return sem_uri
+    return 0
+
+
+def get_ordered_semesters():
+    # return all semester URIs in the order the API lists them
+    url = STUDIP_DOMAIN + '/api.php/semesters/'
+    rsp = req.get(url, auth=USER)
+    return list(rsp.json()['collection'])
+
+
+def get_curr_courses(user_id, semester):
+    # map course id to title for all courses running in the given semester
+    url = STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses'
+    rsp = req.get(url, auth=USER)
+    ord_sems = get_ordered_semesters()
+    courses = rsp.json()['collection']
+    course_list = {}
+    curr_ind = ord_sems.index(semester)
+    for course_uri in courses:
+        course = courses[course_uri]
+        start_sem = course['start_semester']
+        end_sem = course['end_semester']
+        # a course without a start semester runs from the beginning,
+        # one without an end semester runs indefinitely
+        start_ind = ord_sems.index(start_sem) if start_sem is not None else 0
+        end_ind = ord_sems.index(end_sem) if end_sem is not None else len(ord_sems)
+        if start_ind <= curr_ind <= end_ind:
+            course_list[course['course_id']] = course['title']
+    return course_list
+
+
+def get_top_folder(course):
+    # return the id of the root folder of a course
+    url = STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder'
+    rsp = req.get(url, auth=USER)
+    return rsp.json()['id']
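+
+
+# The helpers below walk a course's folder tree: get_docs() and
+# get_subdirs() list a folder's files and subfolders via the REST API,
+# download() streams a single file to disk, and download_folder_rec()
+# recurses through all subfolders, mirroring the Stud.IP structure locally.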
+
+
+def get_docs(folder):
+    # return the ids of all files directly inside a folder
+    url = STUDIP_DOMAIN + '/api.php/folder/' + folder
+    rsp = req.get(url, auth=USER)
+    docs = rsp.json()['file_refs']
+    res_docs = []
+    for doc in docs:
+        res_docs.append(doc['id'])
+    return res_docs
+
+
+def download(doc, dl_time):
+    # download a single file unless it is unchanged since dl_time
+    url = STUDIP_DOMAIN + '/api.php/file/' + doc
+    rsp = req.get(url, auth=USER)
+    doc_name = rsp.json()['name']
+    doc_chdate = rsp.json()['chdate']
+    if dl_time is None or dl_time < doc_chdate:
+        print('downloading ', doc_name)
+        dl_url = STUDIP_DOMAIN + '/api.php/file/' + doc + '/download'
+        dl_rsp = req.get(dl_url, auth=USER, stream=True)
+        total_size = int(dl_rsp.headers.get('content-length', 0))
+        progbar = tqdm(total=total_size, unit='iB', unit_scale=True)
+        with open(doc_name, 'wb') as doc_file:
+            for chunk in dl_rsp.iter_content(CHUNK_SIZE):
+                progbar.update(len(chunk))
+                doc_file.write(chunk)
+        progbar.close()
+
+
+def get_subdirs(folder):
+    # map subfolder id to name for all subfolders of a folder
+    url = STUDIP_DOMAIN + '/api.php/folder/' + folder
+    rsp = req.get(url, auth=USER)
+    subdirs = rsp.json()['subfolders']
+    res_subdirs = {}
+    for subdir in subdirs:
+        res_subdirs[subdir['id']] = subdir['name']
+    return res_subdirs
+
+
+def download_folder(folder, dl_time):
+    # download all files directly inside a folder
+    for doc in get_docs(folder):
+        print('found doc ', doc)
+        download(doc, dl_time)
+
+
+def download_folder_rec(folder, dl_time, base_dir):
+    # download a folder and recurse into all of its subfolders
+    print('folder ', folder)
+    create_dir(base_dir)
+    download_folder(folder, dl_time)
+    subdirs = get_subdirs(folder)
+    os.chdir(base_dir)
+    for subdir in subdirs:
+        # slashes in folder names would act as path separators
+        subdir_name = subdirs[subdir].replace('/', '-')
+        subdir_path = os.path.join(base_dir, subdir_name)
+        print(subdir_path)
+        create_dir(subdir_path)
+        os.chdir(subdir_path)
+        download_folder_rec(subdir, dl_time, subdir_path)
+
+
+def download_course(course, dl_time, base_dir):
+    # download the whole folder tree of a single course
+    print('course ', course)
+    create_dir(base_dir)
+    os.chdir(base_dir)
+    root = get_top_folder(course)
+    download_folder_rec(root, dl_time, base_dir)
+
+
+def download_curr_courses(dl_time, base_dir):
+    # download every course of the current semester into base_dir
+    print('Start downloading all current courses')
+    create_dir(base_dir)
+    curr_courses = get_curr_courses(get_uid(), get_curr_semester())
+    os.chdir(base_dir)
+    for course in curr_courses:
+        print('course is ', curr_courses[course])
+        # slashes in course titles would act as path separators
+        course_name = curr_courses[course].replace('/', '-')
+        path = os.path.join(base_dir, course_name)
+        download_course(course, dl_time, path)
+
+
+download_curr_courses(LAST_DOWNLOAD, BASE_DIR)
+set_last_dl(time.time())
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..15918fe
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+tqdm==4.46.1
+requests==2.23.0