mirror of
https://github.com/tiyn/stud.ip-crawler.git
synced 2026-02-22 06:34:48 +01:00
The log has options for several levels that can be set from the command line. The file is hardcoded as log.txt and can be toggled
55 lines
1.8 KiB
Python
55 lines
1.8 KiB
Python
import os
|
|
import logging as log
|
|
|
|
from studip import Studip
|
|
|
|
|
|
class Crawler:
    """Mirror documents from a Stud.IP client into a local directory tree.

    The crawler drives the ``studip`` object handed to the constructor and
    changes the process working directory (``os.chdir``) into each target
    folder before triggering downloads there. The working directory is not
    restored afterwards.
    """

    def __init__(self, studip):
        """Store the client used for all remote calls.

        Args:
            studip: object providing ``get_docs``, ``download``,
                ``get_subdirs``, ``get_top_folder``, ``get_curr_courses``,
                ``get_uid`` and ``get_curr_semester``.
        """
        self.studip = studip

    def download_folder(self, folder):
        """Download every document that ``get_docs`` reports for ``folder``.

        NOTE(review): presumably ``studip.download`` writes into the current
        working directory -- confirm against the Studip client.
        """
        docs = self.studip.get_docs(folder)
        for doc in docs:
            # Lazy %-formatting: no TypeError if an id is not a plain str.
            log.info('found doc %s', doc)
            self.studip.download(doc)

    def download_folder_rec(self, folder, base_dir):
        """Download ``folder`` and all of its subfolders below ``base_dir``.

        Args:
            folder: identifier of the remote folder to crawl.
            base_dir: local directory for this folder's content; created if
                it does not exist. Relative paths are resolved against the
                working directory at call time.
        """
        # Resolve before any chdir: a relative path would otherwise be
        # re-interpreted against the directory we just changed into.
        base_dir = os.path.abspath(base_dir)
        log.info('crawling folder %s', folder)
        self.create_dir(base_dir)
        # Enter the target directory *before* downloading so that a direct
        # call from an unrelated working directory still lands in base_dir.
        os.chdir(base_dir)
        self.download_folder(folder)
        subdirs = self.studip.get_subdirs(folder)
        for subdir in subdirs:
            # '/' in a remote name would be read as a path separator.
            subdir_name = subdirs[subdir].replace('/', '-')
            subdir_path = os.path.join(base_dir, subdir_name)
            log.debug(subdir_path)
            # The recursive call creates subdir_path and changes into it.
            self.download_folder_rec(subdir, subdir_path)

    def download_course(self, course, base_dir):
        """Download the complete folder tree of ``course`` into ``base_dir``.

        Args:
            course: identifier of the course to crawl.
            base_dir: local directory for the course; created if missing.
                Relative paths are resolved against the working directory
                at call time.
        """
        base_dir = os.path.abspath(base_dir)
        log.info('crawling course %s', course)
        self.create_dir(base_dir)
        os.chdir(base_dir)
        root = self.studip.get_top_folder(course)
        self.download_folder_rec(root, base_dir)

    def download_curr_courses(self, base_dir):
        """Download every course of the current semester below ``base_dir``.

        Each course is stored in a subdirectory named after the course, with
        '/' replaced by '-'.

        Args:
            base_dir: local root directory; created if missing. Relative
                paths are resolved against the working directory at call
                time.
        """
        base_dir = os.path.abspath(base_dir)
        log.info('Start crawling all current courses')
        self.create_dir(base_dir)
        curr_courses = self.studip.get_curr_courses(
            self.studip.get_uid(), self.studip.get_curr_semester())
        os.chdir(base_dir)
        for course in curr_courses:
            log.debug('course is %s', curr_courses[course])
            course_name = curr_courses[course].replace('/', '-')
            path = os.path.join(base_dir, course_name)
            self.download_course(course, path)

    def create_dir(self, dir):
        """Create directory ``dir`` (including missing parents) if absent.

        Nothing happens when the path already exists.
        """
        if not os.path.exists(dir):
            log.info('creating folder %s', dir)
            # makedirs + exist_ok: tolerates missing parents and a directory
            # that appears between the check above and the creation.
            os.makedirs(dir, exist_ok=True)