1
0
mirror of https://github.com/tiyn/stud.ip-crawler.git synced 2026-02-22 06:34:48 +01:00
Files
stud.ip-crawler/src/studip.py
TiynGER 4a8cf45ad3 database: switching from mysql to sqlite
Currently we only need the db to keep track of the files that were
already downloaded, thus we only use one database and only one table.
A complete sql database is a bit too bulky for this purpose.
By using sqlite we can minimize and embed the db.
2020-11-17 11:04:03 +01:00

181 lines
5.5 KiB
Python
Executable File

import time
import logging as log
#from tqdm import tqdm
import requests as req
from requests.auth import HTTPBasicAuth
import json
class Studip:
def __init__(self, chunk_size, domain, user, db):
self.CHUNK_SIZE = chunk_size
self.DOMAIN = domain
self.USER = user
self.db = db
def auth_req(self, url):
"""Creates a request for a user.
Parameter:
url(string): URL to send the request to
Returns:
string: request
"""
url = self.DOMAIN + url
return req.get(url, auth=self.USER)
def get_uid(self):
"""Get the user id of the user specified in the object.
Returns:
string: user id
"""
rsp = self.auth_req('/api.php/user/')
user_id = rsp.json()['user_id']
return user_id
def get_curr_semester(self):
"""Get the current semester of the studip instance specified in the object.
Returns:
string: id for current semester
"""
rsp = self.auth_req('/api.php/semesters/')
curr_time = int(str(int(time.time())))
semesters = rsp.json()['collection']
for sem_uri in semesters:
semester = semesters[sem_uri]
sem_begin = semester['begin']
sem_end = semester['end']
if sem_begin < curr_time < sem_end:
return sem_uri
return 0
def get_ordered_semesters(self):
"""Get the a list of semesters of studip instance specified in the object.
Returns:
list(string): all semesters of the user
"""
rsp = self.auth_req('/api.php/semesters/')
semesters = rsp.json()['collection']
order_sems = []
for sem_uri in semesters:
order_sems.append(sem_uri)
return order_sems
def get_curr_courses(self, user_id, semester):
"""Get the a list of semesters of studip instance specified in the object.
Returns:
string: id of the current semester
"""
rsp = self.auth_req('/api.php/user/' + user_id + '/courses?limit=1000')
ord_sems = self.get_ordered_semesters()
courses = rsp.json()['collection']
course_list = {}
for course_uri in courses:
course = courses[course_uri]
start_sem = course['start_semester']
if start_sem != None:
start_ind = ord_sems.index(start_sem)
else:
start_ind = 100
end_sem = course['end_semester']
if end_sem != None:
end_ind = ord_sems.index(end_sem)
else:
end_ind = 100
curr_ind = ord_sems.index(semester)
if start_ind <= curr_ind <= end_ind:
course_title = course['title']
log.info(course_title + " will be downloaded")
course_id = course['course_id']
course_list[course_id] = course_title
return course_list
def get_top_folder(self, course):
"""Retrieves the top folder id of a given course.
Parameters:
course (string): course to get the top folder of
Returns:
string: id of the top folder
"""
rsp = self.auth_req('/api.php/course/' + course + '/top_folder')
top_folder = rsp.json()
tf_id = top_folder['id']
return(tf_id)
def get_docs(self, folder):
"""Get all the documents of a given folder.
Parameters:
folder(string): id of the folder to get documents of
Returns:
list(string): ids of the documents
"""
rsp = self.auth_req('/api.php/folder/' + folder)
res_docs = []
try:
docs = rsp.json()['file_refs']
except ValueError:
return res_docs
for doc in docs:
try:
doc_id = doc['id']
res_docs.append(doc_id)
except KeyError:
return res_docs
return(res_docs)
def download(self, doc):
"""Download a document.
Parameters:
doc (string): id of the document to download
"""
rsp1 = self.auth_req('/api.php/file/' + doc)
doc_name = rsp1.json()['name']
doc_chdate = rsp1.json()['chdate']
last_dl = self.db.get_last_file_dl(doc)
if last_dl == None or last_dl < doc_chdate:
rsp2 = self.auth_req('/api.php/file/' + doc + '/download')
log.info('downloading ' + doc_name)
try:
with open(doc_name, 'wb') as doc_file:
for chunk in rsp2.iter_content(self.CHUNK_SIZE):
doc_file.write(chunk)
self.db.set_last_file_dl(str(doc), str(int(time.time())))
except OSError:
log.warning("Error while writing to the file " + doc_name)
def get_subdirs(self, folder):
"""Get all the subdirectories of a given folder.
Parameters:
folder(string): id of the folder to get subdirectories of
Returns:
list(string): ids of the subdirectories
"""
res_subdirs = {}
rsp = self.auth_req('/api.php/folder/' + folder)
try:
subdirs = rsp.json()['subfolders']
except ValueError:
return res_subdirs
for subdir in subdirs:
try:
sub_id = subdir['id']
sub_name = subdir['name']
res_subdirs[sub_id] = sub_name
except KeyError:
return res_subdirs
return res_subdirs