@ -0,0 +1,2 @@ |
|||||
|
last_dl.txt |
||||
|
data |
||||
@ -0,0 +1,38 @@ |
|||||
|
# Stud.IP Crawler |
||||
|
|
||||
|
This is a program that downloads all files available for a given Stud.IP user. |
||||
|
It only downloads and searches through the courses in the current semester. |
||||
|
If you run the program again, it only downloads files that have changed since the last run. |
||||
|
|
||||
|
## Features/To-Dos |
||||
|
|
||||
|
[x] Downloads files of the given user's active semester via the command line |
||||
|
[x] Keeping file structure of Stud.IP |
||||
|
[x] Specify username |
||||
|
[x] Specify password |
||||
|
[x] Specify stud.ip-url |
||||
|
[x] Specify output directory |
||||
|
[x] Specify chunk size to download big files |
||||
|
[x] Only download files after given date |
||||
|
[x] Save and read download date |
||||
|
[x] Possible reset of download date |
||||
|
[ ] Incremental file download |
||||
|
[ ] Indexing |
||||
|
|
||||
|
## Installation |
||||
|
|
||||
|
- `git clone https://github.com/tiyn/studip-crawler` |
||||
|
- `cd studip-crawler` |
||||
|
- `pip3 install -r requirements` - install dependencies |
||||
|
|
||||
|
## Usage |
||||
|
|
||||
|
Just run the file via `python3 crawler.py [options]`. |
||||
|
Alternatively to `python3 crawler.py` you can make the script executable using `chmod +x crawler.py` and |
||||
|
run it with `./crawler.py [options]`. |
||||
|
Several options are required for the program to work. |
||||
|
Run `python3 crawler.py -h` for a help menu and see which ones are important for you. |
||||
|
|
||||
|
## Tested StudIP instances |
||||
|
|
||||
|
- Carl von Ossietzky Universität Oldenburg |
||||
@ -0,0 +1,202 @@ |
|||||
|
#!/usr/bin/env python3
||||
|
import time |
||||
|
import os |
||||
|
import argparse |
||||
|
|
||||
|
from tqdm import tqdm |
||||
|
import requests as req |
||||
|
from requests.auth import HTTPBasicAuth |
||||
|
|
||||
|
|
||||
|
def create_dir(dir):
    """Create the directory ``dir`` if it does not exist yet.

    Missing parent directories are created as well, so a nested output
    path such as ``./a/b/c`` works on the first run. A short notice is
    printed only when the path did not exist before the call.

    :param dir: path of the directory to create
    """
    if not os.path.exists(dir):
        print('creating folder', dir)
        # exist_ok guards against the directory appearing between the
        # existence check above and the actual creation.
        os.makedirs(dir, exist_ok=True)
||||
|
|
||||
|
|
||||
|
def set_last_dl(time):
    """Store the time of the last download in ``last_dl.txt``.

    The file is written relative to the current working directory and
    replaces any previous content.

    :param time: timestamp to store; only the text before the first ``.``
        of ``str(time)`` is written (``1591000000.52`` -> ``1591000000``).
        Passing ``None`` writes the text ``None``, which is not a valid
        integer and therefore acts as "no download date stored".
    """
    stamp = str(time).split('.')[0]
    # The context manager guarantees the file is flushed and closed.
    with open('last_dl.txt', 'w') as last_dl_file:
        last_dl_file.write(stamp)
||||
|
|
||||
|
|
||||
|
def get_last_dl():
    """Read the time of the last download from ``last_dl.txt``.

    The file is looked up relative to the current working directory.

    :return: the stored timestamp as ``int``, or ``None`` when the file is
        missing, unreadable or does not contain an integer
    """
    try:
        with open('last_dl.txt', 'r') as last_dl_file:
            return int(last_dl_file.read())
    except (OSError, ValueError):
        # OSError: file missing/unreadable; ValueError: content is not an
        # integer (e.g. after a reset). Anything else should surface.
        return None
||||
|
|
||||
|
# --- command line interface -------------------------------------------------
parser = argparse.ArgumentParser(description='Download Files from StudIP.')
parser.add_argument('-o', '--output', type=str, default='./data',
                    help='path to output directory')
parser.add_argument('-u', '--user', type=str, help='studip username',
                    required=True)
# NOTE(review): a password passed on the command line is visible in the
# process list and the shell history.
parser.add_argument('-p', '--passw', type=str, help='studip password',
                    required=True)
parser.add_argument('-s', '--url', type=str, help='studip url', required=True)
parser.add_argument('-c', '--chunk', type=int, default=1024 * 1024,
                    help='chunksize for downloading data')
parser.add_argument('-r', '--reset_dl_date', action='store_true',
                    help='reset the saved download date and download all '
                         'files again')

args = parser.parse_args()

# --- run configuration derived from the arguments ---------------------------
# Absolute path, because the crawler changes the working directory while it
# walks through the course folders.
BASE_DIR = os.path.abspath(args.output)
CHUNK_SIZE = args.chunk
# Every request URL is built as STUDIP_DOMAIN + '/api.php/...'; dropping
# trailing slashes avoids '//api.php/...' when the URL is given with one.
STUDIP_DOMAIN = args.url.rstrip('/')
USERNAME = args.user
PASSWORD = args.passw
# (username, password) tuple handed to requests as ``auth=``.
USER = (USERNAME, PASSWORD)
if args.reset_dl_date:
    # Overwrite the stored date so that the read below yields None.
    set_last_dl(None)
LAST_DOWNLOAD = get_last_dl()
||||
|
|
||||
|
|
||||
|
def get_uid():
    """Return the ``user_id`` of the authenticated user.

    Queries ``/api.php/user/`` with the configured credentials.

    :return: value of the ``user_id`` field of the JSON response
    :raises requests.HTTPError: if the server answers with an error status
        (e.g. wrong credentials)
    """
    url = STUDIP_DOMAIN + '/api.php/user/'
    rsp = req.get(url, auth=USER)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError.
    rsp.raise_for_status()
    return rsp.json()['user_id']
||||
|
|
||||
|
|
||||
|
def get_curr_semester():
    """Return the key of the semester that contains the current time.

    Queries ``/api.php/semesters/`` and compares each entry's ``begin`` and
    ``end`` values with the current Unix time in whole seconds.

    :return: the key of the first matching entry in the response's
        ``collection``, or ``0`` if no semester contains the current time
    :raises requests.HTTPError: if the server answers with an error status
    """
    url = STUDIP_DOMAIN + '/api.php/semesters/'
    rsp = req.get(url, auth=USER)
    rsp.raise_for_status()
    curr_time = int(time.time())
    semesters = rsp.json()['collection']
    for sem_uri in semesters:
        semester = semesters[sem_uri]
        # Strict comparison: the exact boundary seconds do not count.
        if semester['begin'] < curr_time < semester['end']:
            return sem_uri
    # Sentinel kept for callers: no semester is currently running.
    return 0
||||
|
|
||||
|
|
||||
|
def get_ordered_semesters():
    """Return the semester keys in the order the API lists them.

    Queries ``/api.php/semesters/`` and collects the keys of the response's
    ``collection``.
    NOTE(review): callers treat the position in this list as a chronological
    index - confirm that the API returns semesters sorted by time.

    :return: list of semester keys
    :raises requests.HTTPError: if the server answers with an error status
    """
    url = STUDIP_DOMAIN + '/api.php/semesters/'
    rsp = req.get(url, auth=USER)
    rsp.raise_for_status()
    return list(rsp.json()['collection'])
||||
|
|
||||
|
|
||||
|
def get_curr_courses(user_id, semester):
    """Return the user's courses that are running in ``semester``.

    A course is selected when the index of ``semester`` in the list from
    ``get_ordered_semesters()`` lies between the indices of the course's
    ``start_semester`` and ``end_semester`` (both inclusive). A missing
    ``end_semester`` is treated as open-ended.
    NOTE(review): a course without ``start_semester`` is never selected;
    this mirrors the previous sentinel logic - confirm it is intended.

    :param user_id: user id, inserted into the request URL
    :param semester: semester key as returned by ``get_curr_semester()``
    :return: dict mapping ``course_id`` to course ``title``
    :raises requests.HTTPError: if the server answers with an error status
    :raises ValueError: if ``semester`` (or a course's start/end semester)
        is not in the semester list, e.g. when no semester is running
    """
    url = STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses'
    rsp = req.get(url, auth=USER)
    rsp.raise_for_status()
    ord_sems = get_ordered_semesters()
    courses = rsp.json()['collection']
    course_list = {}
    if not courses:
        return course_list

    # Loop-invariant, so resolve it once and fail with a readable message.
    try:
        curr_ind = ord_sems.index(semester)
    except ValueError:
        raise ValueError(
            'semester %r is not in the list of known semesters' % (semester,)
        ) from None

    # Compares greater than every list index, whatever the list length.
    unbounded = float('inf')
    for course_uri in courses:
        course = courses[course_uri]
        start_sem = course['start_semester']
        end_sem = course['end_semester']
        if start_sem is not None:
            start_ind = ord_sems.index(start_sem)
        else:
            start_ind = unbounded
        if end_sem is not None:
            end_ind = ord_sems.index(end_sem)
        else:
            end_ind = unbounded
        if start_ind <= curr_ind <= end_ind:
            course_list[course['course_id']] = course['title']
    return course_list
||||
|
|
||||
|
|
||||
|
def get_top_folder(course):
    """Return the id of the top-level folder of a course.

    Queries ``/api.php/course/<course>/top_folder``.

    :param course: course id, inserted into the request URL
    :return: value of the ``id`` field of the JSON response
    :raises requests.HTTPError: if the server answers with an error status
    """
    url = STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder'
    rsp = req.get(url, auth=USER)
    rsp.raise_for_status()
    return rsp.json()['id']
||||
|
|
||||
|
|
||||
|
def get_docs(folder):
    """Return the ids of all files directly inside a folder.

    Queries ``/api.php/folder/<folder>`` and reads the ``file_refs`` list of
    the response.

    :param folder: folder id, inserted into the request URL
    :return: list with the ``id`` of every entry in ``file_refs``
    :raises requests.HTTPError: if the server answers with an error status
    """
    url = STUDIP_DOMAIN + '/api.php/folder/' + folder
    rsp = req.get(url, auth=USER)
    rsp.raise_for_status()
    return [doc['id'] for doc in rsp.json()['file_refs']]
||||
|
|
||||
|
|
||||
|
def download(doc, time):
    """Download one file into the current working directory.

    The file's metadata is fetched first; the content is only downloaded
    when ``time`` is ``None`` or older than the file's ``chdate``.

    :param doc: file id, inserted into the request URLs
    :param time: time of the last download run, or ``None`` to download
        unconditionally
    :raises requests.HTTPError: if the server answers with an error status
    """
    url1 = STUDIP_DOMAIN + '/api.php/file/' + doc
    rsp1 = req.get(url1, auth=USER)
    rsp1.raise_for_status()
    meta = rsp1.json()
    doc_name = meta['name']
    doc_chdate = meta['chdate']
    if time is not None and time >= doc_chdate:
        # Unchanged since the last run - nothing to do.
        return

    print('downloading ', doc_name)
    url2 = STUDIP_DOMAIN + '/api.php/file/' + doc + '/download'
    # A '/' in the name would be read as a directory separator; folder
    # names are sanitised the same way.
    file_name = doc_name.replace('/', '-')
    with req.get(url2, auth=USER, stream=True) as rsp2:
        # Check before opening the target so an error page is never saved
        # as if it were the file.
        rsp2.raise_for_status()
        total_size = int(rsp2.headers.get('content-length', 0))
        with tqdm(total=total_size, unit='iB', unit_scale=True) as progbar, \
                open(file_name, 'wb') as out_file:
            for chunk in rsp2.iter_content(CHUNK_SIZE):
                progbar.update(len(chunk))
                out_file.write(chunk)
||||
|
|
||||
|
|
||||
|
def get_subdirs(folder):
    """Return the direct subfolders of a folder.

    Queries ``/api.php/folder/<folder>`` and reads the ``subfolders`` list
    of the response.

    :param folder: folder id, inserted into the request URL
    :return: dict mapping each subfolder's ``id`` to its ``name``
    :raises requests.HTTPError: if the server answers with an error status
    """
    url = STUDIP_DOMAIN + '/api.php/folder/' + folder
    rsp = req.get(url, auth=USER)
    rsp.raise_for_status()
    return {sub['id']: sub['name'] for sub in rsp.json()['subfolders']}
||||
|
|
||||
|
|
||||
|
def download_folder(folder, time):
    """Download every file that sits directly in ``folder``.

    Subfolders are not visited here.

    :param folder: folder id passed to ``get_docs``
    :param time: time of the last download run, forwarded to ``download``
    """
    for doc_id in get_docs(folder):
        print('found doc ', doc_id)
        download(doc_id, time)
||||
|
|
||||
|
|
||||
|
def download_folder_rec(folder, time, base_dir):
    """Download a folder and all of its subfolders into ``base_dir``.

    The local directory tree follows the remote folder names; ``/`` in a
    folder name is replaced by ``-``. The working directory is changed
    while walking the tree and is not restored afterwards.

    :param folder: id of the remote folder to download
    :param time: time of the last download run, forwarded to the downloads
    :param base_dir: local directory that receives the folder's files;
        should be an absolute path because the working directory changes
    """
    print('folder ', folder)
    create_dir(base_dir)
    # Files are written relative to the working directory, so switch to the
    # target before downloading instead of relying on the caller's cwd.
    os.chdir(base_dir)
    download_folder(folder, time)
    subdirs = get_subdirs(folder)
    for subdir in subdirs:
        subdir_name = subdirs[subdir].replace('/', '-')
        subdir_path = os.path.join(base_dir, subdir_name)
        print(subdir_path)
        create_dir(subdir_path)
        # The recursive call changes into subdir_path itself.
        download_folder_rec(subdir, time, subdir_path)
||||
|
|
||||
|
|
||||
|
def download_course(course, time, base_dir):
    """Download the complete folder tree of one course into ``base_dir``.

    :param course: course id used to look up the course's top folder
    :param time: time of the last download run, forwarded to the downloads
    :param base_dir: local directory for the course; created if missing
    """
    print('course ', course)
    create_dir(base_dir)
    os.chdir(base_dir)
    # Start the recursive walk at the course's top-level folder.
    download_folder_rec(get_top_folder(course), time, base_dir)
||||
|
|
||||
|
|
||||
|
def download_curr_courses(time, base_dir):
    """Download the files of all courses of the current semester.

    Each course gets its own directory below ``base_dir``, named after the
    course title with ``/`` replaced by ``-``.

    :param time: time of the last download run, forwarded to the downloads
    :param base_dir: root output directory; created if missing
    """
    print('Start downloading all current courses')
    create_dir(base_dir)
    uid = get_uid()
    current_semester = get_curr_semester()
    curr_courses = get_curr_courses(uid, current_semester)
    os.chdir(base_dir)
    for course_id, title in curr_courses.items():
        print('course is ', title)
        target = os.path.join(base_dir, title.replace('/', '-'))
        download_course(course_id, time, target)
||||
|
|
||||
|
|
||||
|
# --- run the crawler ----------------------------------------------------------
# The download date file is addressed relative to the working directory, and
# the crawl changes that directory. Remember the starting directory so the
# date is written to the same place it was read from at startup.
_START_DIR = os.getcwd()
# Record the start of the run: files changed while the crawl is in progress
# are then picked up again next time instead of being skipped.
_RUN_STARTED = time.time()
try:
    download_curr_courses(LAST_DOWNLOAD, BASE_DIR)
finally:
    os.chdir(_START_DIR)
# Only reached when the crawl finished without an exception.
set_last_dl(_RUN_STARTED)
||||
@ -0,0 +1,2 @@ |
|||||
|
tqdm==4.46.1 |
||||
|
requests==2.23.0 |
||||