initial commit

2025-11-19 03:29:46 +01:00 · 2020-06-05 19:47:36 +02:00
commit 704d4c86eb
4 changed files with 244 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
 last_dl.txt
 data
--- a/README.md
+++ b/README.md
@@ -0,0 +1,38 @@
 # Stud.IP Crawler
 This is a program that downloads all files available for a given Stud.IP user.
 It only downloads and searches through the courses in the current semester.
 If you run the program again it only downloads files that have changed since the last time running it.
 ## Features/To-Dos
 [x] Downloads files of given users active semester via commandline
    [x] Keeping file structure of Stud.IP
    [x] Specify username
    [x] Specify password
    [x] Specify stud.ip-url
    [x] Specify output directory
    [x] Specify chunk size to download big files
 [x] Only download files after given date
    [x] Save and read download date
    [x] Possible reset of download date
 [ ] Incremental file download
    [ ] Indexing
 ## Installation
 - `git clone https://github.com/tiyn/studip-crawler`
 - `cd studip-crawler`
 - `pip3install -r requirements` - install dependencies
 ## Usage
 Just run the file via `python3 crawler.py [options]`.
 Alternatively to `python3 crawler.py` you can give yourself permissions using `chmod +x crawler.py [options]` and
 run it with `./crawler.py [options]`.
 There are several options required to work.
 Run `python3 crawler.py -h` for a help menu and see which ones are important for you.
 ## Tested StudIP instances
 - Carl von Ossietzky Universität Oldenburg
--- a/crawler.py
+++ b/crawler.py
@@ -0,0 +1,202 @@
 #!/bin/env python3
 import time
 import os
 import argparse
 from tqdm import tqdm
 import requests as req
 from requests.auth import HTTPBasicAuth
 def create_dir(dir):
    if not os.path.exists(dir):
        print('creating folder', dir)
        os.mkdir(dir)
 def set_last_dl(time):
    last_dl_file = open('last_dl.txt', 'w')
    last_dl_file.write(str(time).split('.')[0])
 def get_last_dl():
    try:
        last_dl_file = open('last_dl.txt', 'r')
        return int(last_dl_file.read())
    except:
        return None
 parser = argparse.ArgumentParser(description='Download Files from StudIP.')
 parser.add_argument('-o', '--output', type=str,
                    default='./data', help='path to output directory')
 parser.add_argument('-u', '--user', type=str, help='studip username', required=True)
 parser.add_argument('-p', '--passw', type=str, help='studip password', required=True)
 parser.add_argument('-s', '--url', type=str, help='studip url', required=True)
 parser.add_argument('-c', '--chunk', type=int, default=1024 *
                    1024, help='chunksize for downloading data')
 parser.add_argument('-r', '--reset_dl_date', action='store_true')
 args = parser.parse_args()
 BASE_DIR = os.path.abspath(args.output)
 CHUNK_SIZE = args.chunk
 STUDIP_DOMAIN = args.url
 USERNAME = args.user
 PASSWORD = args.passw
 USER = (USERNAME, PASSWORD)
 if args.reset_dl_date:
    set_last_dl(None)
 LAST_DOWNLOAD = get_last_dl()
 def get_uid():
    url = STUDIP_DOMAIN + '/api.php/user/'
    rsp = req.get(url, auth=USER)
    user_id = rsp.json()['user_id']
    return user_id
 def get_curr_semester():
    url = STUDIP_DOMAIN + '/api.php/semesters/'
    rsp = req.get(url, auth=USER)
    curr_time = int(str(time.time()).split('.')[0])
    semesters = rsp.json()['collection']
    for sem_uri in semesters:
        semester = semesters[sem_uri]
        sem_begin = semester['begin']
        sem_end = semester['end']
        if sem_begin < curr_time < sem_end:
            return sem_uri
    return 0
 def get_ordered_semesters():
    url = STUDIP_DOMAIN + '/api.php/semesters/'
    rsp = req.get(url, auth=USER)
    semesters = rsp.json()['collection']
    order_sems = []
    for sem_uri in semesters:
        order_sems.append(sem_uri)
    return order_sems
 def get_curr_courses(user_id, semester):
    url = STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses'
    rsp = req.get(url, auth=USER)
    ord_sems = get_ordered_semesters()
    courses = rsp.json()['collection']
    i = 0
    course_list = {}
    for course_uri in courses:
        course = courses[course_uri]
        start_sem = course['start_semester']
        if start_sem != None:
            start_ind = ord_sems.index(start_sem)
        else:
            start_ind = 100
        end_sem = course['end_semester']
        if end_sem != None:
            end_ind = ord_sems.index(end_sem)
        else:
            end_ind = 100
        curr_ind = ord_sems.index(semester)
        if start_ind <= curr_ind <= end_ind:
            course_title = course['title']
            course_id = course['course_id']
            course_list[course_id] = course_title
    return course_list
 def get_top_folder(course):
    url = STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder'
    rsp = req.get(url, auth=USER)
    top_folder = rsp.json()
    tf_id = top_folder['id']
    return(tf_id)
 def get_docs(folder):
    url = STUDIP_DOMAIN + '/api.php/folder/' + folder
    rsp = req.get(url, auth=USER)
    docs = rsp.json()['file_refs']
    res_docs = []
    for doc in docs:
        doc_id = doc['id']
        res_docs.append(doc_id)
    return(res_docs)
 def download(doc, time):
    url1 = STUDIP_DOMAIN + '/api.php/file/' + doc
    rsp1 = req.get(url1, auth=USER)
    doc_name = rsp1.json()['name']
    doc_chdate = rsp1.json()['chdate']
    if time == None or time < doc_chdate:
        print('downloading ', doc_name)
        url2 = STUDIP_DOMAIN + '/api.php/file/' + doc + '/download'
        rsp2 = req.get(url2, auth=USER, stream=True)
        total_size = int(rsp2.headers.get('content-length', 0))
        progbar = tqdm(total=total_size, unit='iB', unit_scale=True)
        with open(doc_name, 'wb') as doc:
            for chunk in rsp2.iter_content(CHUNK_SIZE):
                progbar.update(len(chunk))
                doc.write(chunk)
 def get_subdirs(folder):
    url = STUDIP_DOMAIN + '/api.php/folder/' + folder
    rsp = req.get(url, auth=USER)
    subdirs = rsp.json()['subfolders']
    docs = rsp.json()['file_refs']
    res_subdirs = {}
    for subdir in subdirs:
        sub_id = subdir['id']
        sub_name = subdir['name']
        res_subdirs[sub_id] = sub_name
    return res_subdirs
 def download_folder(folder, time):
    docs = get_docs(folder)
    for doc in docs:
        print('found doc ', doc)
        download(doc, time)
 def download_folder_rec(folder, time, base_dir):
    print('folder ', folder)
    create_dir(base_dir)
    download_folder(folder, time)
    subdirs = get_subdirs(folder)
    os.chdir(base_dir)
    for subdir in subdirs:
        subdir_name = subdirs[subdir].replace('/', '-')
        subdir_path = os.path.join(base_dir, subdir_name)
        print(subdir_path)
        create_dir(subdir_path)
        os.chdir(subdir_path)
        download_folder_rec(subdir, time, subdir_path)
 def download_course(course, time, base_dir):
    print('course ', course)
    create_dir(base_dir)
    os.chdir(base_dir)
    root = get_top_folder(course)
    download_folder_rec(root, time, base_dir)
 def download_curr_courses(time, base_dir):
    print('Start downloading all current courses')
    create_dir(base_dir)
    curr_courses = get_curr_courses(get_uid(), get_curr_semester())
    os.chdir(base_dir)
    for course in curr_courses:
        print('course is ', curr_courses[course])
        course_name = curr_courses[course].replace('/', '-')
        path = os.path.join(base_dir, course_name)
        download_course(course, time, path)
 download_curr_courses(LAST_DOWNLOAD, BASE_DIR)
 set_last_dl(time.time())
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,2 @@
 tqdm==4.46.1
 requests==2.23.0