commit
704d4c86eb
@ -0,0 +1,2 @@
|
||||
last_dl.txt
|
||||
data
|
@ -0,0 +1,38 @@
|
||||
# Stud.IP Crawler
|
||||
|
||||
This is a program that downloads all files available for a given Stud.IP user.
|
||||
It only downloads and searches through the courses in the current semester.
|
||||
If you run the program again, it only downloads files that have changed since the last run.
|
||||
|
||||
## Features/To-Dos
|
||||
|
||||
[x] Downloads the files of a given user's active semester via the command line
|
||||
[x] Keeping file structure of Stud.IP
|
||||
[x] Specify username
|
||||
[x] Specify password
|
||||
[x] Specify stud.ip-url
|
||||
[x] Specify output directory
|
||||
[x] Specify chunk size to download big files
|
||||
[x] Only download files after given date
|
||||
[x] Save and read download date
|
||||
[x] Possible reset of download date
|
||||
[ ] Incremental file download
|
||||
[ ] Indexing
|
||||
|
||||
## Installation
|
||||
|
||||
- `git clone https://github.com/tiyn/studip-crawler`
|
||||
- `cd studip-crawler`
|
||||
- `pip3 install -r requirements` - install dependencies
|
||||
|
||||
## Usage
|
||||
|
||||
Just run the file via `python3 crawler.py [options]`.
|
||||
Alternatively to `python3 crawler.py` you can make the script executable using `chmod +x crawler.py` and
|
||||
run it with `./crawler.py [options]`.
|
||||
Several options are required for the program to work.
|
||||
Run `python3 crawler.py -h` for a help menu and see which ones are important for you.
|
||||
|
||||
## Tested StudIP instances
|
||||
|
||||
- Carl von Ossietzky Universität Oldenburg
|
@ -0,0 +1,202 @@
|
||||
#!/usr/bin/env python3
|
||||
import time
|
||||
import os
|
||||
import argparse
|
||||
|
||||
from tqdm import tqdm
|
||||
import requests as req
|
||||
from requests.auth import HTTPBasicAuth
|
||||
|
||||
|
||||
def create_dir(dir):
    """Create the directory ``dir`` if it does not exist yet.

    Missing parent directories are created as well, so a nested output
    path such as ``./a/b`` works on the first run. A notice is printed
    before the directory is created; nothing happens if the path exists.

    Args:
        dir: Path of the directory to create.
    """
    if not os.path.exists(dir):
        print('creating folder', dir)
        # makedirs (not mkdir) so missing parents are created too; exist_ok
        # tolerates the directory appearing between the check and the call.
        os.makedirs(dir, exist_ok=True)
|
||||
|
||||
|
||||
def set_last_dl(time):
    """Store the last-download timestamp in ``last_dl.txt``.

    The file is written relative to the current working directory.

    Args:
        time: Seconds since the epoch (int or float); the fractional part
            is dropped. ``None`` resets the stored date by writing an empty
            file, which no longer parses as a timestamp.
    """
    content = '' if time is None else str(int(time))
    # Context manager so the handle is flushed and closed deterministically.
    with open('last_dl.txt', 'w') as last_dl_file:
        last_dl_file.write(content)
|
||||
|
||||
|
||||
def get_last_dl():
    """Read the last-download timestamp from ``last_dl.txt``.

    The file is looked up relative to the current working directory.

    Returns:
        The stored timestamp as an int, or ``None`` when the file is
        missing, unreadable, or does not contain an integer.
    """
    try:
        with open('last_dl.txt', 'r') as last_dl_file:
            return int(last_dl_file.read())
    except (OSError, ValueError):
        # OSError: file missing/unreadable; ValueError: content is not an
        # integer (e.g. after a reset). Both mean "no previous download".
        return None
|
||||
|
||||
# Command-line interface. User, password and URL are mandatory; the other
# options have defaults.
# NOTE(review): a password passed on the command line is usually visible in
# the process list and shell history - consider prompting for it instead.
parser = argparse.ArgumentParser(description='Download Files from StudIP.')
parser.add_argument('-o', '--output', type=str, default='./data',
                    help='path to output directory')
parser.add_argument('-u', '--user', type=str, help='studip username',
                    required=True)
parser.add_argument('-p', '--passw', type=str, help='studip password',
                    required=True)
parser.add_argument('-s', '--url', type=str, help='studip url', required=True)
parser.add_argument('-c', '--chunk', type=int, default=1024 * 1024,
                    help='chunksize for downloading data')
parser.add_argument('-r', '--reset_dl_date', action='store_true',
                    help='reset the saved download date and download all '
                         'files again')

args = parser.parse_args()

# Absolute path, because the download functions change the working directory.
BASE_DIR = os.path.abspath(args.output)
CHUNK_SIZE = args.chunk
# Drop trailing slashes so the '/api.php/...' suffixes appended to this
# value never produce a double slash in the request URL.
STUDIP_DOMAIN = args.url.rstrip('/')
USERNAME = args.user
PASSWORD = args.passw
# Credentials tuple handed to every request through ``auth=``.
USER = (USERNAME, PASSWORD)
if args.reset_dl_date:
    set_last_dl(None)
# Timestamp of the previous run, or None to download everything.
LAST_DOWNLOAD = get_last_dl()
|
||||
|
||||
|
||||
def get_uid():
    """Return the ``user_id`` reported by the ``/api.php/user/`` endpoint.

    The request is authenticated with the module-level ``USER`` credentials.
    """
    response = req.get(STUDIP_DOMAIN + '/api.php/user/', auth=USER)
    return response.json()['user_id']
|
||||
|
||||
|
||||
def get_curr_semester():
    """Find the semester whose time span contains the current time.

    Queries ``/api.php/semesters/`` and compares each entry's ``begin``
    and ``end`` values against the current time in whole seconds.

    Returns:
        The key of the matching entry in the response's ``collection``,
        or ``0`` when no semester spans the current time.
    """
    response = req.get(STUDIP_DOMAIN + '/api.php/semesters/', auth=USER)
    now = int(time.time())
    collection = response.json()['collection']
    for uri in collection:
        entry = collection[uri]
        begin, end = entry['begin'], entry['end']
        # Strict comparison on both sides: the boundaries do not count.
        if begin < now and now < end:
            return uri
    return 0
|
||||
|
||||
|
||||
def get_ordered_semesters():
    """Return the keys of the semester collection as a list.

    The list keeps the order in which ``/api.php/semesters/`` delivers
    the entries.
    NOTE(review): callers treat the position in this list as a
    chronological rank - confirm the API returns semesters in time order.
    """
    response = req.get(STUDIP_DOMAIN + '/api.php/semesters/', auth=USER)
    return list(response.json()['collection'])
|
||||
|
||||
|
||||
def get_curr_courses(user_id, semester):
    """Collect the user's courses that are active in ``semester``.

    A course is active when the position of ``semester`` in the list from
    ``get_ordered_semesters()`` lies between the positions of the course's
    ``start_semester`` and ``end_semester`` (inclusive). A course without
    an end semester is treated as open-ended; a course without a start
    semester is never selected.

    Args:
        user_id: Stud.IP user id whose courses are listed.
        semester: Semester key as found in the semester collection.

    Returns:
        Dict mapping ``course_id`` to the course title.

    Raises:
        ValueError: if ``semester`` or a course's start/end semester is not
            part of the semester list (only when the user has courses).
    """
    url = STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses'
    rsp = req.get(url, auth=USER)
    ord_sems = get_ordered_semesters()
    courses = rsp.json()['collection']
    course_list = {}
    if not courses:
        # Nothing to filter; also avoids looking up an invalid semester.
        return course_list

    # One past the last valid index: stands in for "no semester given"
    # and is larger than any real position, however many semesters exist.
    unbounded = len(ord_sems)
    # The current semester's position is the same for every course.
    curr_ind = ord_sems.index(semester)

    for course_uri in courses:
        course = courses[course_uri]
        start_sem = course['start_semester']
        end_sem = course['end_semester']
        if start_sem is not None:
            start_ind = ord_sems.index(start_sem)
        else:
            start_ind = unbounded
        if end_sem is not None:
            end_ind = ord_sems.index(end_sem)
        else:
            end_ind = unbounded
        if start_ind <= curr_ind <= end_ind:
            course_list[course['course_id']] = course['title']
    return course_list
|
||||
|
||||
|
||||
def get_top_folder(course):
    """Return the ``id`` of the top-level folder of ``course``.

    Args:
        course: Course id used in the ``/api.php/course/<id>/top_folder``
            request.
    """
    endpoint = STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder'
    return req.get(endpoint, auth=USER).json()['id']
|
||||
|
||||
|
||||
def get_docs(folder):
    """List the ids of all file references directly inside ``folder``.

    Args:
        folder: Folder id used in the ``/api.php/folder/<id>`` request.

    Returns:
        List with the ``id`` of every entry in the response's ``file_refs``.
    """
    response = req.get(STUDIP_DOMAIN + '/api.php/folder/' + folder, auth=USER)
    return [file_ref['id'] for file_ref in response.json()['file_refs']]
|
||||
|
||||
|
||||
def download(doc, time):
    """Download the file ``doc`` into the current working directory.

    The file's metadata is fetched first; the download is skipped when the
    file's ``chdate`` is not newer than ``time``.

    Args:
        doc: File id used in the ``/api.php/file/<id>`` requests.
        time: Timestamp of the previous run; ``None`` downloads
            unconditionally.
    """
    meta = req.get(STUDIP_DOMAIN + '/api.php/file/' + doc, auth=USER).json()
    doc_chdate = meta['chdate']
    if not (time is None or time < doc_chdate):
        return

    # The name comes from the server; replace path separators the same way
    # folder and course names are handled, so the file cannot end up
    # outside the current directory.
    doc_name = meta['name'].replace('/', '-')
    print('downloading ', doc_name)
    url = STUDIP_DOMAIN + '/api.php/file/' + doc + '/download'
    # Context managers release the streamed connection, the progress bar
    # and the output file even if the transfer fails midway.
    with req.get(url, auth=USER, stream=True) as rsp:
        total_size = int(rsp.headers.get('content-length', 0))
        with tqdm(total=total_size, unit='iB', unit_scale=True) as progbar, \
                open(doc_name, 'wb') as out_file:
            for chunk in rsp.iter_content(CHUNK_SIZE):
                progbar.update(len(chunk))
                out_file.write(chunk)
|
||||
|
||||
|
||||
def get_subdirs(folder):
    """Return the direct subfolders of ``folder``.

    Args:
        folder: Folder id used in the ``/api.php/folder/<id>`` request.

    Returns:
        Dict mapping each subfolder's ``id`` to its ``name``.
    """
    url = STUDIP_DOMAIN + '/api.php/folder/' + folder
    rsp = req.get(url, auth=USER)
    return {subdir['id']: subdir['name'] for subdir in rsp.json()['subfolders']}
|
||||
|
||||
|
||||
def download_folder(folder, time):
    """Download every file directly inside ``folder`` (no recursion).

    Args:
        folder: Folder id whose file references are fetched.
        time: Timestamp passed on to ``download`` for the change check.
    """
    for doc_id in get_docs(folder):
        print('found doc ', doc_id)
        download(doc_id, time)
|
||||
|
||||
|
||||
def download_folder_rec(folder, time, base_dir):
    """Download ``folder`` and all of its subfolders into ``base_dir``.

    The files of ``folder`` go directly into ``base_dir``; every subfolder
    is mirrored as a subdirectory named after it (with ``/`` replaced by
    ``-``). The working directory is changed as a side effect.

    Args:
        folder: Folder id to download.
        time: Timestamp passed on to ``download`` for the change check.
        base_dir: Local directory that receives the folder's content.
    """
    print('folder ', folder)
    # Resolve before the first chdir so later joins stay valid even if a
    # relative path was passed in.
    base_dir = os.path.abspath(base_dir)
    create_dir(base_dir)
    # download() writes into the working directory, so enter the target
    # directory first rather than relying on the caller's cwd.
    os.chdir(base_dir)
    download_folder(folder, time)
    subdirs = get_subdirs(folder)
    for subdir in subdirs:
        subdir_name = subdirs[subdir].replace('/', '-')
        subdir_path = os.path.join(base_dir, subdir_name)
        print(subdir_path)
        create_dir(subdir_path)
        # The recursive call changes into subdir_path itself.
        download_folder_rec(subdir, time, subdir_path)
|
||||
|
||||
|
||||
def download_course(course, time, base_dir):
    """Download the whole folder tree of ``course`` into ``base_dir``.

    Creates ``base_dir`` if needed, makes it the working directory, and
    recursively downloads the course's top folder.

    Args:
        course: Course id.
        time: Timestamp passed on to the downloads for the change check.
        base_dir: Local directory for the course.
    """
    print('course ', course)
    create_dir(base_dir)
    os.chdir(base_dir)
    download_folder_rec(get_top_folder(course), time, base_dir)
|
||||
|
||||
|
||||
def download_curr_courses(time, base_dir):
    """Download all courses of the current semester below ``base_dir``.

    Each course gets its own subdirectory named after the course title,
    with ``/`` replaced by ``-``.

    Args:
        time: Timestamp passed on to the downloads for the change check.
        base_dir: Root output directory.
    """
    print('Start downloading all current courses')
    create_dir(base_dir)
    uid = get_uid()
    semester = get_curr_semester()
    courses = get_curr_courses(uid, semester)
    os.chdir(base_dir)
    for course_id, title in courses.items():
        print('course is ', title)
        target = os.path.join(base_dir, title.replace('/', '-'))
        download_course(course_id, time, target)
|
||||
|
||||
|
||||
# The download functions chdir into the output tree, while last_dl.txt is
# read from the directory the script was started in. Remember that directory
# and return to it, so the timestamp is written where the next run reads it.
_start_dir = os.getcwd()
# Take the timestamp before the crawl: a file changed on the server while
# the crawl is running is then fetched again on the next run, not skipped.
_run_started = time.time()
try:
    download_curr_courses(LAST_DOWNLOAD, BASE_DIR)
finally:
    os.chdir(_start_dir)
# Only reached when the crawl finished without an exception.
set_last_dl(_run_started)
|
@ -0,0 +1,2 @@
|
||||
tqdm==4.46.1
|
||||
requests==2.23.0
|
Loading…
Reference in new issue