mirror of
				https://github.com/tiyn/stud.ip-crawler.git
				synced 2025-10-25 09:11:18 +02:00 
			
		
		
		
	initial commit
This commit is contained in:
		
							
								
								
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | ||||
| last_dl.txt | ||||
| data | ||||
							
								
								
									
										38
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								README.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,38 @@ | ||||
| # Stud.IP Crawler | ||||
|  | ||||
| This is a program that downloads all files available for a given Stud.IP user. | ||||
| It only downloads and searches through the courses in the current semester. | ||||
| If you run the program again it only downloads files that have changed since the last time running it. | ||||
|  | ||||
| ## Features/To-Dos | ||||
|  | ||||
| [x] Downloads files of given users active semester via commandline | ||||
|     [x] Keeping file structure of Stud.IP | ||||
|     [x] Specify username | ||||
|     [x] Specify password | ||||
|     [x] Specify stud.ip-url | ||||
|     [x] Specify output directory | ||||
|     [x] Specify chunk size to download big files | ||||
| [x] Only download files after given date | ||||
|     [x] Save and read download date | ||||
|     [x] Possible reset of download date | ||||
| [ ] Incremental file download | ||||
|     [ ] Indexing | ||||
|  | ||||
| ## Installation | ||||
|  | ||||
| - `git clone https://github.com/tiyn/studip-crawler` | ||||
| - `cd studip-crawler` | ||||
| - `pip3install -r requirements` - install dependencies | ||||
|  | ||||
| ## Usage | ||||
|  | ||||
| Just run the file via `python3 crawler.py [options]`. | ||||
| Alternatively to `python3 crawler.py` you can give yourself permissions using `chmod +x crawler.py [options]` and | ||||
| run it with `./crawler.py [options]`. | ||||
| There are several options required to work. | ||||
| Run `python3 crawler.py -h` for a help menu and see which ones are important for you. | ||||
|  | ||||
| ## Tested StudIP instances | ||||
|  | ||||
| - Carl von Ossietzky Universität Oldenburg | ||||
							
								
								
									
										202
									
								
								crawler.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										202
									
								
								crawler.py
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,202 @@ | ||||
| #!/bin/env python3 | ||||
| import time | ||||
| import os | ||||
| import argparse | ||||
|  | ||||
| from tqdm import tqdm | ||||
| import requests as req | ||||
| from requests.auth import HTTPBasicAuth | ||||
|  | ||||
|  | ||||
| def create_dir(dir): | ||||
|     if not os.path.exists(dir): | ||||
|         print('creating folder', dir) | ||||
|         os.mkdir(dir) | ||||
|  | ||||
|  | ||||
| def set_last_dl(time): | ||||
|     last_dl_file = open('last_dl.txt', 'w') | ||||
|     last_dl_file.write(str(time).split('.')[0]) | ||||
|  | ||||
|  | ||||
| def get_last_dl(): | ||||
|     try: | ||||
|         last_dl_file = open('last_dl.txt', 'r') | ||||
|         return int(last_dl_file.read()) | ||||
|     except: | ||||
|         return None | ||||
|  | ||||
| parser = argparse.ArgumentParser(description='Download Files from StudIP.') | ||||
| parser.add_argument('-o', '--output', type=str, | ||||
|                     default='./data', help='path to output directory') | ||||
| parser.add_argument('-u', '--user', type=str, help='studip username', required=True) | ||||
| parser.add_argument('-p', '--passw', type=str, help='studip password', required=True) | ||||
| parser.add_argument('-s', '--url', type=str, help='studip url', required=True) | ||||
| parser.add_argument('-c', '--chunk', type=int, default=1024 * | ||||
|                     1024, help='chunksize for downloading data') | ||||
| parser.add_argument('-r', '--reset_dl_date', action='store_true') | ||||
|  | ||||
| args = parser.parse_args() | ||||
|  | ||||
| BASE_DIR = os.path.abspath(args.output) | ||||
| CHUNK_SIZE = args.chunk | ||||
| STUDIP_DOMAIN = args.url | ||||
| USERNAME = args.user | ||||
| PASSWORD = args.passw | ||||
| USER = (USERNAME, PASSWORD) | ||||
| if args.reset_dl_date: | ||||
|     set_last_dl(None) | ||||
| LAST_DOWNLOAD = get_last_dl() | ||||
|  | ||||
|  | ||||
| def get_uid(): | ||||
|     url = STUDIP_DOMAIN + '/api.php/user/' | ||||
|     rsp = req.get(url, auth=USER) | ||||
|     user_id = rsp.json()['user_id'] | ||||
|     return user_id | ||||
|  | ||||
|  | ||||
| def get_curr_semester(): | ||||
|     url = STUDIP_DOMAIN + '/api.php/semesters/' | ||||
|     rsp = req.get(url, auth=USER) | ||||
|     curr_time = int(str(time.time()).split('.')[0]) | ||||
|     semesters = rsp.json()['collection'] | ||||
|     for sem_uri in semesters: | ||||
|         semester = semesters[sem_uri] | ||||
|         sem_begin = semester['begin'] | ||||
|         sem_end = semester['end'] | ||||
|         if sem_begin < curr_time < sem_end: | ||||
|             return sem_uri | ||||
|     return 0 | ||||
|  | ||||
|  | ||||
| def get_ordered_semesters(): | ||||
|     url = STUDIP_DOMAIN + '/api.php/semesters/' | ||||
|     rsp = req.get(url, auth=USER) | ||||
|     semesters = rsp.json()['collection'] | ||||
|     order_sems = [] | ||||
|     for sem_uri in semesters: | ||||
|         order_sems.append(sem_uri) | ||||
|     return order_sems | ||||
|  | ||||
|  | ||||
| def get_curr_courses(user_id, semester): | ||||
|     url = STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses' | ||||
|     rsp = req.get(url, auth=USER) | ||||
|     ord_sems = get_ordered_semesters() | ||||
|     courses = rsp.json()['collection'] | ||||
|     i = 0 | ||||
|     course_list = {} | ||||
|     for course_uri in courses: | ||||
|         course = courses[course_uri] | ||||
|         start_sem = course['start_semester'] | ||||
|         if start_sem != None: | ||||
|             start_ind = ord_sems.index(start_sem) | ||||
|         else: | ||||
|             start_ind = 100 | ||||
|         end_sem = course['end_semester'] | ||||
|         if end_sem != None: | ||||
|             end_ind = ord_sems.index(end_sem) | ||||
|         else: | ||||
|             end_ind = 100 | ||||
|         curr_ind = ord_sems.index(semester) | ||||
|         if start_ind <= curr_ind <= end_ind: | ||||
|             course_title = course['title'] | ||||
|             course_id = course['course_id'] | ||||
|             course_list[course_id] = course_title | ||||
|     return course_list | ||||
|  | ||||
|  | ||||
| def get_top_folder(course): | ||||
|     url = STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder' | ||||
|     rsp = req.get(url, auth=USER) | ||||
|     top_folder = rsp.json() | ||||
|     tf_id = top_folder['id'] | ||||
|     return(tf_id) | ||||
|  | ||||
|  | ||||
| def get_docs(folder): | ||||
|     url = STUDIP_DOMAIN + '/api.php/folder/' + folder | ||||
|     rsp = req.get(url, auth=USER) | ||||
|     docs = rsp.json()['file_refs'] | ||||
|     res_docs = [] | ||||
|     for doc in docs: | ||||
|         doc_id = doc['id'] | ||||
|         res_docs.append(doc_id) | ||||
|     return(res_docs) | ||||
|  | ||||
|  | ||||
| def download(doc, time): | ||||
|     url1 = STUDIP_DOMAIN + '/api.php/file/' + doc | ||||
|     rsp1 = req.get(url1, auth=USER) | ||||
|     doc_name = rsp1.json()['name'] | ||||
|     doc_chdate = rsp1.json()['chdate'] | ||||
|     if time == None or time < doc_chdate: | ||||
|         print('downloading ', doc_name) | ||||
|         url2 = STUDIP_DOMAIN + '/api.php/file/' + doc + '/download' | ||||
|         rsp2 = req.get(url2, auth=USER, stream=True) | ||||
|         total_size = int(rsp2.headers.get('content-length', 0)) | ||||
|         progbar = tqdm(total=total_size, unit='iB', unit_scale=True) | ||||
|         with open(doc_name, 'wb') as doc: | ||||
|             for chunk in rsp2.iter_content(CHUNK_SIZE): | ||||
|                 progbar.update(len(chunk)) | ||||
|                 doc.write(chunk) | ||||
|  | ||||
|  | ||||
| def get_subdirs(folder): | ||||
|     url = STUDIP_DOMAIN + '/api.php/folder/' + folder | ||||
|     rsp = req.get(url, auth=USER) | ||||
|     subdirs = rsp.json()['subfolders'] | ||||
|     docs = rsp.json()['file_refs'] | ||||
|     res_subdirs = {} | ||||
|     for subdir in subdirs: | ||||
|         sub_id = subdir['id'] | ||||
|         sub_name = subdir['name'] | ||||
|         res_subdirs[sub_id] = sub_name | ||||
|     return res_subdirs | ||||
|  | ||||
|  | ||||
| def download_folder(folder, time): | ||||
|     docs = get_docs(folder) | ||||
|     for doc in docs: | ||||
|         print('found doc ', doc) | ||||
|         download(doc, time) | ||||
|  | ||||
|  | ||||
| def download_folder_rec(folder, time, base_dir): | ||||
|     print('folder ', folder) | ||||
|     create_dir(base_dir) | ||||
|     download_folder(folder, time) | ||||
|     subdirs = get_subdirs(folder) | ||||
|     os.chdir(base_dir) | ||||
|     for subdir in subdirs: | ||||
|         subdir_name = subdirs[subdir].replace('/', '-') | ||||
|         subdir_path = os.path.join(base_dir, subdir_name) | ||||
|         print(subdir_path) | ||||
|         create_dir(subdir_path) | ||||
|         os.chdir(subdir_path) | ||||
|         download_folder_rec(subdir, time, subdir_path) | ||||
|  | ||||
|  | ||||
| def download_course(course, time, base_dir): | ||||
|     print('course ', course) | ||||
|     create_dir(base_dir) | ||||
|     os.chdir(base_dir) | ||||
|     root = get_top_folder(course) | ||||
|     download_folder_rec(root, time, base_dir) | ||||
|  | ||||
|  | ||||
| def download_curr_courses(time, base_dir): | ||||
|     print('Start downloading all current courses') | ||||
|     create_dir(base_dir) | ||||
|     curr_courses = get_curr_courses(get_uid(), get_curr_semester()) | ||||
|     os.chdir(base_dir) | ||||
|     for course in curr_courses: | ||||
|         print('course is ', curr_courses[course]) | ||||
|         course_name = curr_courses[course].replace('/', '-') | ||||
|         path = os.path.join(base_dir, course_name) | ||||
|         download_course(course, time, path) | ||||
|  | ||||
|  | ||||
| download_curr_courses(LAST_DOWNLOAD, BASE_DIR) | ||||
| set_last_dl(time.time()) | ||||
							
								
								
									
										2
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | ||||
| tqdm==4.46.1 | ||||
| requests==2.23.0 | ||||
		Reference in New Issue
	
	Block a user