- mysql creates the database and tables on the given MySQL server if they do not already exist - mysql reads the last change values from the db - mysql saves ch_date after downloading - run now takes care of the variables for mysql and studipmaster
parent
6d18baa8b6
commit
fa36e0f29e
@ -1,2 +1 @@
|
|||||||
last_dl.txt
|
database
|
||||||
data
|
|
||||||
|
@ -1,202 +0,0 @@
|
|||||||
#!/bin/env python3
|
|
||||||
import time
|
|
||||||
import os
|
|
||||||
import argparse
|
|
||||||
|
|
||||||
from tqdm import tqdm
|
|
||||||
import requests as req
|
|
||||||
from requests.auth import HTTPBasicAuth
|
|
||||||
|
|
||||||
|
|
||||||
def create_dir(dir):
    """Create the directory ``dir`` if it does not exist yet.

    Missing parent directories are created as well, so a nested output
    path works on the first run.

    Args:
        dir: Path of the directory to create.
    """
    if not os.path.exists(dir):
        print('creating folder', dir)
        # makedirs instead of mkdir: handles nested paths; exist_ok covers the
        # case where the directory appears between the check and the call.
        os.makedirs(dir, exist_ok=True)
|
|
||||||
|
|
||||||
|
|
||||||
def set_last_dl(time):
    """Persist the last download time to ``last_dl.txt`` in the working directory.

    Only the part of ``str(time)`` before the first ``'.'`` is written, i.e.
    the whole seconds of a float timestamp. Passing ``None`` writes the text
    ``'None'``.

    Args:
        time: Timestamp to store (the script passes ``time.time()`` or ``None``).
    """
    # The context manager guarantees the handle is flushed and closed.
    with open('last_dl.txt', 'w') as last_dl_file:
        last_dl_file.write(str(time).split('.')[0])
|
|
||||||
|
|
||||||
|
|
||||||
def get_last_dl():
    """Read the last download time from ``last_dl.txt``.

    Returns:
        The stored timestamp as ``int``, or ``None`` when the file cannot be
        read or its content is not an integer (for example the text ``'None'``).
    """
    try:
        with open('last_dl.txt', 'r') as last_dl_file:
            return int(last_dl_file.read())
    except (OSError, ValueError):
        # Missing/unreadable file or non-numeric content: no previous download.
        return None
|
|
||||||
|
|
||||||
# --- command line interface -------------------------------------------------
parser = argparse.ArgumentParser(description='Download Files from StudIP.')
parser.add_argument(
    '-o', '--output', type=str, default='./data',
    help='path to output directory')
parser.add_argument(
    '-u', '--user', type=str, required=True,
    help='studip username')
parser.add_argument(
    '-p', '--passw', type=str, required=True,
    help='studip password')
parser.add_argument(
    '-s', '--url', type=str, required=True,
    help='studip url')
parser.add_argument(
    '-c', '--chunk', type=int, default=1024 * 1024,
    help='chunksize for downloading data')
parser.add_argument('-r', '--reset_dl_date', action='store_true')

args = parser.parse_args()

# --- module-wide configuration derived from the arguments --------------------
BASE_DIR = os.path.abspath(args.output)
CHUNK_SIZE = args.chunk
STUDIP_DOMAIN = args.url
USERNAME = args.user
PASSWORD = args.passw
# (username, password) pair handed to requests as the ``auth`` argument.
USER = (USERNAME, PASSWORD)

# With --reset_dl_date the stored timestamp is overwritten before it is read
# back, so the value loaded below is None.
if args.reset_dl_date:
    set_last_dl(None)
LAST_DOWNLOAD = get_last_dl()
|
|
||||||
|
|
||||||
|
|
||||||
def get_uid():
    """Return the ``user_id`` field of the ``/api.php/user/`` response."""
    response = req.get(STUDIP_DOMAIN + '/api.php/user/', auth=USER)
    return response.json()['user_id']
|
|
||||||
|
|
||||||
|
|
||||||
def get_curr_semester():
    """Find the semester whose time span contains the current time.

    Returns:
        The key of the first entry in the ``collection`` of
        ``/api.php/semesters/`` with ``begin < now < end``, or ``0`` when no
        entry matches.
    """
    response = req.get(STUDIP_DOMAIN + '/api.php/semesters/', auth=USER)
    # Whole seconds since the epoch.
    now = int(time.time())
    collection = response.json()['collection']
    for uri in collection:
        entry = collection[uri]
        begin, end = entry['begin'], entry['end']
        if begin < now < end:
            return uri
    return 0
|
|
||||||
|
|
||||||
|
|
||||||
def get_ordered_semesters():
    """Return the keys of the semester ``collection`` as a list.

    The order is the iteration order of the ``collection`` returned by
    ``/api.php/semesters/``.
    """
    response = req.get(STUDIP_DOMAIN + '/api.php/semesters/', auth=USER)
    return list(response.json()['collection'])
|
|
||||||
|
|
||||||
|
|
||||||
def get_curr_courses(user_id, semester):
    """Collect the courses of a user that span the given semester.

    A course is selected when the position of ``semester`` in the ordered
    semester list lies between the positions of the course's
    ``start_semester`` and ``end_semester`` (inclusive).

    Args:
        user_id: Id of the user whose courses are requested.
        semester: Semester key as contained in ``get_ordered_semesters()``.

    Returns:
        dict mapping ``course_id`` to the course ``title``.

    Raises:
        ValueError: If ``semester`` or a course's start/end semester is not
            in the ordered semester list (and there is at least one course).
    """
    # Index used when a course has no start/end semester set.
    # NOTE(review): for a missing start this excludes the course unless the
    # current index is >= 100 — confirm that is intended.
    missing_index = 100

    url = STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses'
    rsp = req.get(url, auth=USER)
    ord_sems = get_ordered_semesters()
    courses = rsp.json()['collection']
    if not courses:
        # Nothing to filter; also avoids the semester lookup below.
        return {}

    # Loop-invariant, so resolved once instead of once per course.
    curr_ind = ord_sems.index(semester)

    course_list = {}
    for course_uri in courses:
        course = courses[course_uri]
        start_sem = course['start_semester']
        end_sem = course['end_semester']
        if start_sem is not None:
            start_ind = ord_sems.index(start_sem)
        else:
            start_ind = missing_index
        if end_sem is not None:
            end_ind = ord_sems.index(end_sem)
        else:
            end_ind = missing_index
        if start_ind <= curr_ind <= end_ind:
            course_list[course['course_id']] = course['title']
    return course_list
|
|
||||||
|
|
||||||
|
|
||||||
def get_top_folder(course):
    """Return the ``id`` of the top folder of the course with id ``course``."""
    endpoint = STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder'
    return req.get(endpoint, auth=USER).json()['id']
|
|
||||||
|
|
||||||
|
|
||||||
def get_docs(folder):
    """Return the ids of all ``file_refs`` listed for the folder ``folder``."""
    response = req.get(STUDIP_DOMAIN + '/api.php/folder/' + folder, auth=USER)
    return [file_ref['id'] for file_ref in response.json()['file_refs']]
|
|
||||||
|
|
||||||
|
|
||||||
def download(doc, time):
    """Download one file into the current working directory if it changed.

    The file is fetched when ``time`` is ``None`` or older than the file's
    ``chdate``; otherwise nothing is downloaded.

    Args:
        doc: Id of the file.
        time: Timestamp of the last download, or ``None`` to always download.
    """
    url1 = STUDIP_DOMAIN + '/api.php/file/' + doc
    # Parse the metadata once instead of once per field.
    file_info = req.get(url1, auth=USER).json()
    doc_name = file_info['name']
    doc_chdate = file_info['chdate']
    if time is None or time < doc_chdate:
        print('downloading ', doc_name)
        url2 = STUDIP_DOMAIN + '/api.php/file/' + doc + '/download'
        rsp2 = req.get(url2, auth=USER, stream=True)
        total_size = int(rsp2.headers.get('content-length', 0))
        # Same sanitising as for folder names: a '/' in the name would
        # otherwise be interpreted as a path separator by open().
        file_name = doc_name.replace('/', '-')
        # The context managers close the progress bar and the file even when
        # the transfer fails; a separate name avoids shadowing ``doc``.
        with tqdm(total=total_size, unit='iB', unit_scale=True) as progbar, \
                open(file_name, 'wb') as doc_file:
            for chunk in rsp2.iter_content(CHUNK_SIZE):
                progbar.update(len(chunk))
                doc_file.write(chunk)
|
|
||||||
|
|
||||||
|
|
||||||
def get_subdirs(folder):
    """Return the direct subfolders of the folder ``folder``.

    Returns:
        dict mapping each subfolder ``id`` to its ``name``.
    """
    url = STUDIP_DOMAIN + '/api.php/folder/' + folder
    rsp = req.get(url, auth=USER)
    # Only 'subfolders' is needed here; the previously fetched 'file_refs'
    # value was never used.
    return {subdir['id']: subdir['name'] for subdir in rsp.json()['subfolders']}
|
|
||||||
|
|
||||||
|
|
||||||
def download_folder(folder, time):
    """Download every file that is directly inside the folder ``folder``.

    Args:
        folder: Id of the folder.
        time: Last download timestamp forwarded to ``download``.
    """
    for doc_id in get_docs(folder):
        print('found doc ', doc_id)
        download(doc_id, time)
|
|
||||||
|
|
||||||
|
|
||||||
def download_folder_rec(folder, time, base_dir):
    """Download a folder and, recursively, all of its subfolders.

    Files of ``folder`` are downloaded first (into the current working
    directory), then each subfolder is mirrored below ``base_dir``; ``/`` in
    subfolder names is replaced by ``-``.

    Args:
        folder: Id of the folder to download.
        time: Last download timestamp forwarded to ``download_folder``.
        base_dir: Local directory that corresponds to ``folder``.
    """
    print('folder ', folder)
    create_dir(base_dir)
    download_folder(folder, time)
    children = get_subdirs(folder)
    os.chdir(base_dir)
    for child_id, child_name in children.items():
        child_path = os.path.join(base_dir, child_name.replace('/', '-'))
        print(child_path)
        create_dir(child_path)
        # Downloads are written to the working directory, so switch to the
        # subfolder before descending into it.
        os.chdir(child_path)
        download_folder_rec(child_id, time, child_path)
|
|
||||||
|
|
||||||
|
|
||||||
def download_course(course, time, base_dir):
    """Download the complete folder tree of one course into ``base_dir``.

    Args:
        course: Id of the course.
        time: Last download timestamp forwarded to the folder download.
        base_dir: Local directory for the course; created if missing.
    """
    print('course ', course)
    create_dir(base_dir)
    os.chdir(base_dir)
    top_folder_id = get_top_folder(course)
    download_folder_rec(top_folder_id, time, base_dir)
|
|
||||||
|
|
||||||
|
|
||||||
def download_curr_courses(time, base_dir):
    """Download all courses of the current semester below ``base_dir``.

    Each course gets its own directory named after the course title, with
    ``/`` replaced by ``-``.

    Args:
        time: Last download timestamp forwarded to ``download_course``.
        base_dir: Root output directory; created if missing.
    """
    print('Start downloading all current courses')
    create_dir(base_dir)
    user_id = get_uid()
    semester = get_curr_semester()
    titles_by_id = get_curr_courses(user_id, semester)
    os.chdir(base_dir)
    for course_id, title in titles_by_id.items():
        print('course is ', title)
        target = os.path.join(base_dir, title.replace('/', '-'))
        download_course(course_id, time, target)
|
|
||||||
|
|
||||||
|
|
||||||
# Entry point: crawl all current courses, skipping files that have not
# changed since the stored timestamp.
download_curr_courses(LAST_DOWNLOAD, BASE_DIR)
# Store the new timestamp only after the crawl has returned.
set_last_dl(time.time())
|
|
@ -0,0 +1,2 @@
|
|||||||
|
__pycache__
|
||||||
|
data
|
@ -0,0 +1,64 @@
|
|||||||
|
#!/bin/env python3
|
||||||
|
import time
|
||||||
|
|
||||||
|
import pymysql
|
||||||
|
|
||||||
|
|
||||||
|
class Database:
    """Stores, per file id, a timestamp in a MySQL table.

    The connection settings are plain attributes that the caller assigns
    after construction. Every method opens its own connection and closes it
    again before returning.
    """

    def __init__(self):
        """Initialise all settings; only the table name has a default."""
        self.HOST = None
        self.PORT = None
        self.DB_NAME = None
        self.USER = None
        self.PASSW = None
        self.TABLE_FILE = 'files'
        # When True, get_last_file_dl() always answers None.
        self.RESET_DL_DATE = False

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` as a backtick-quoted MySQL identifier.

        Identifiers cannot be passed as query parameters, so they are quoted
        here; embedded backticks are doubled.
        """
        return '`' + str(name).replace('`', '``') + '`'

    def connect(self):
        """Open and return a new connection (no database selected).

        Returns:
            A pymysql connection whose cursors return rows as dicts.
        """
        return pymysql.connect(
            host=self.HOST,
            port=self.PORT,
            user=self.USER,
            password=self.PASSW,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor
        )

    def setup_db(self):
        """Create the database and the file table if they do not exist."""
        db = self.connect()
        try:
            with db.cursor() as crs:
                crs.execute("CREATE DATABASE IF NOT EXISTS "
                            + self._quote_identifier(self.DB_NAME))
                db.select_db(self.DB_NAME)
                crs.execute(
                    "CREATE TABLE IF NOT EXISTS "
                    + self._quote_identifier(self.TABLE_FILE)
                    + " (id CHAR(32) NOT NULL,"
                    " ch_date INT(11) NOT NULL,"
                    " PRIMARY KEY(id))")
            print(db)
        finally:
            # Always release the connection, also when a statement fails.
            db.close()

    def set_last_file_dl(self, file_id, time):
        """Insert or update the timestamp stored for ``file_id``.

        Args:
            file_id: Id of the file (primary key of the table).
            time: Timestamp as ``int`` or numeric string.

        Raises:
            ValueError: If ``time`` cannot be converted to ``int``.
        """
        timestamp = int(time)
        db = self.connect()
        try:
            db.select_db(self.DB_NAME)
            print('file: ', file_id, ' time: ', time)
            # Values are passed as parameters so the driver escapes them.
            query = ("INSERT INTO " + self._quote_identifier(self.TABLE_FILE)
                     + " (`id`, `ch_date`) VALUES (%s, %s)"
                     " ON DUPLICATE KEY UPDATE `ch_date` = %s")
            with db.cursor() as crs:
                crs.execute(query, (file_id, timestamp, timestamp))
            db.commit()
        finally:
            db.close()

    def get_last_file_dl(self, file_id):
        """Return the timestamp stored for ``file_id``.

        Returns:
            The ``ch_date`` value of the row, or ``None`` when no row exists
            or ``RESET_DL_DATE`` is set (the database is not queried then).
        """
        if self.RESET_DL_DATE:
            return None
        db = self.connect()
        try:
            db.select_db(self.DB_NAME)
            # Uses the configured table name, like the other methods.
            query = ("SELECT ch_date FROM "
                     + self._quote_identifier(self.TABLE_FILE)
                     + " WHERE id = %s")
            with db.cursor() as crs:
                crs.execute(query, (file_id,))
                res = crs.fetchone()
        finally:
            db.close()
        if res is not None:
            return res['ch_date']
        return None
|
@ -1,2 +1,3 @@
|
|||||||
tqdm==4.46.1
|
tqdm==4.46.1
|
||||||
requests==2.23.0
|
requests==2.23.0
|
||||||
|
PyMySQL==0.9.3
|
@ -0,0 +1,47 @@
|
|||||||
|
#!/bin/env python3
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
import studip
|
||||||
|
import mysql
|
||||||
|
|
||||||
|
|
||||||
|
# --- command line interface -------------------------------------------------
parser = argparse.ArgumentParser(description='Download Files from StudIP.')
parser.add_argument(
    '-o', '--output', type=str, default='./data',
    help='path to output directory')
parser.add_argument(
    '-u', '--user', type=str, required=True,
    help='studip username')
parser.add_argument(
    '-p', '--passwd', type=str, required=True,
    help='studip password')
parser.add_argument(
    '-s', '--url', type=str, required=True,
    help='studip url')
parser.add_argument(
    '--chunk', type=int, default=1024 * 1024,
    help='chunksize for downloading data')
parser.add_argument(
    '-r', '--reset_dl_date', action='store_true',
    help='downloads everything and ignores last download date')
parser.add_argument(
    '--host', type=str, default='localhost',
    help='mysql host')
parser.add_argument(
    '--port', type=int, default=3306,
    help='mysql port')
parser.add_argument(
    '--db_name', type=str, default='studip',
    help='mysql database name')
parser.add_argument(
    '--db_user', type=str, default='root',
    help='mysql database user')
parser.add_argument(
    '--db_passwd', type=str, default='secret-pw',
    help='mysql database password')
args = parser.parse_args()

BASE_DIR = os.path.abspath(args.output)
USERNAME = args.user
PASSWORD = args.passwd

# --- database: copy the settings onto the object, then create the schema -----
db = mysql.Database()
db.HOST = args.host
db.PORT = args.port
db.DB_NAME = args.db_name
db.USER = args.db_user
db.PASSW = args.db_passwd
db.RESET_DL_DATE = args.reset_dl_date
db.setup_db()

# --- crawler: configure it with the StudIP settings and start the download ---
crwlr = studip.Crawler(db)
crwlr.CHUNK_SIZE = args.chunk
crwlr.STUDIP_DOMAIN = args.url
crwlr.USER = (USERNAME, PASSWORD)

crwlr.download_curr_courses(BASE_DIR)
|
@ -0,0 +1,162 @@
|
|||||||
|
#!/bin/env python3
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
import requests as req
|
||||||
|
from requests.auth import HTTPBasicAuth
|
||||||
|
|
||||||
|
|
||||||
|
class Crawler:
    """Downloads the files of a user's current courses through the REST API.

    ``CHUNK_SIZE``, ``STUDIP_DOMAIN`` and ``USER`` are plain attributes that
    the caller assigns after construction. ``db`` is asked when a file was
    last downloaded (``get_last_file_dl``) and told about each completed
    download (``set_last_file_dl``).

    Files are written to the process working directory, which the
    ``download_*`` methods change with ``os.chdir`` while walking the tree.
    """

    def __init__(self, db):
        """Store ``db``; all other settings start as ``None``."""
        self.CHUNK_SIZE = None
        self.STUDIP_DOMAIN = None
        self.USER = None
        self.db = db

    def create_dir(self, dir):
        """Create the directory ``dir``, including parents, if it is missing."""
        if not os.path.exists(dir):
            print('creating folder', dir)
            # makedirs instead of mkdir: handles nested paths; exist_ok covers
            # the case where the directory appears between check and call.
            os.makedirs(dir, exist_ok=True)

    def get_uid(self):
        """Return the ``user_id`` field of the ``/api.php/user/`` response."""
        url = self.STUDIP_DOMAIN + '/api.php/user/'
        rsp = req.get(url, auth=self.USER)
        return rsp.json()['user_id']

    def get_curr_semester(self):
        """Find the semester whose time span contains the current time.

        Returns:
            The key of the first semester with ``begin < now < end``, or
            ``0`` when no semester matches.
        """
        url = self.STUDIP_DOMAIN + '/api.php/semesters/'
        rsp = req.get(url, auth=self.USER)
        curr_time = int(time.time())
        semesters = rsp.json()['collection']
        for sem_uri in semesters:
            semester = semesters[sem_uri]
            if semester['begin'] < curr_time < semester['end']:
                return sem_uri
        return 0

    def get_ordered_semesters(self):
        """Return the keys of the semester ``collection`` as a list."""
        url = self.STUDIP_DOMAIN + '/api.php/semesters/'
        rsp = req.get(url, auth=self.USER)
        return list(rsp.json()['collection'])

    def get_curr_courses(self, user_id, semester):
        """Collect the courses of a user that span the given semester.

        Args:
            user_id: Id of the user whose courses are requested.
            semester: Semester key as contained in ``get_ordered_semesters()``.

        Returns:
            dict mapping ``course_id`` to the course ``title``.

        Raises:
            ValueError: If ``semester`` or a course's start/end semester is
                not in the ordered semester list (and there is at least one
                course).
        """
        # Index used when a course has no start/end semester set.
        # NOTE(review): for a missing start this excludes the course unless
        # the current index is >= 100 — confirm that is intended.
        missing_index = 100

        url = self.STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses'
        rsp = req.get(url, auth=self.USER)
        ord_sems = self.get_ordered_semesters()
        courses = rsp.json()['collection']
        if not courses:
            # Nothing to filter; also avoids the semester lookup below.
            return {}

        # Loop-invariant, so resolved once instead of once per course.
        curr_ind = ord_sems.index(semester)

        course_list = {}
        for course_uri in courses:
            course = courses[course_uri]
            start_sem = course['start_semester']
            end_sem = course['end_semester']
            if start_sem is not None:
                start_ind = ord_sems.index(start_sem)
            else:
                start_ind = missing_index
            if end_sem is not None:
                end_ind = ord_sems.index(end_sem)
            else:
                end_ind = missing_index
            if start_ind <= curr_ind <= end_ind:
                course_list[course['course_id']] = course['title']
        return course_list

    def get_top_folder(self, course):
        """Return the ``id`` of the top folder of the course ``course``."""
        url = self.STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder'
        rsp = req.get(url, auth=self.USER)
        return rsp.json()['id']

    def get_docs(self, folder):
        """Return the ids of all ``file_refs`` listed for ``folder``."""
        url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder
        rsp = req.get(url, auth=self.USER)
        return [doc['id'] for doc in rsp.json()['file_refs']]

    def download(self, doc):
        """Download one file into the working directory if it changed.

        The file is fetched when the database has no timestamp for it or the
        stored timestamp is older than the file's ``chdate``. After a
        successful transfer the current time is stored for the file.

        Args:
            doc: Id of the file.
        """
        url1 = self.STUDIP_DOMAIN + '/api.php/file/' + doc
        # Parse the metadata once instead of once per field.
        file_info = req.get(url1, auth=self.USER).json()
        doc_name = file_info['name']
        doc_chdate = file_info['chdate']
        last_dl = self.db.get_last_file_dl(doc)
        if last_dl is None or last_dl < doc_chdate:
            print('downloading ', doc_name)
            url2 = self.STUDIP_DOMAIN + '/api.php/file/' + doc + '/download'
            # The context manager releases the streamed connection.
            with req.get(url2, auth=self.USER, stream=True) as rsp2:
                if not rsp2.ok:
                    # Do not save an error page as the file and do not record
                    # the file as downloaded; it is retried on the next run.
                    print('download failed for ', doc_name,
                          ' status: ', rsp2.status_code)
                    return
                total_size = int(rsp2.headers.get('content-length', 0))
                # Same sanitising as for folder names: a '/' in the name
                # would be interpreted as a path separator by open().
                file_name = doc_name.replace('/', '-')
                with tqdm(total=total_size, unit='iB',
                          unit_scale=True) as progbar, \
                        open(file_name, 'wb') as doc_file:
                    for chunk in rsp2.iter_content(self.CHUNK_SIZE):
                        progbar.update(len(chunk))
                        doc_file.write(chunk)
            self.db.set_last_file_dl(str(doc), str(int(time.time())))

    def get_subdirs(self, folder):
        """Return the direct subfolders of ``folder`` as ``{id: name}``."""
        url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder
        rsp = req.get(url, auth=self.USER)
        return {subdir['id']: subdir['name']
                for subdir in rsp.json()['subfolders']}

    def download_folder(self, folder):
        """Download every file that is directly inside ``folder``."""
        for doc in self.get_docs(folder):
            print('found doc ', doc)
            self.download(doc)

    def download_folder_rec(self, folder, base_dir):
        """Download a folder and, recursively, all of its subfolders.

        Files of ``folder`` are downloaded first (into the current working
        directory), then each subfolder is mirrored below ``base_dir``;
        ``/`` in subfolder names is replaced by ``-``.

        Args:
            folder: Id of the folder to download.
            base_dir: Local directory that corresponds to ``folder``.
        """
        print('folder ', folder)
        self.create_dir(base_dir)
        self.download_folder(folder)
        subdirs = self.get_subdirs(folder)
        os.chdir(base_dir)
        for subdir, name in subdirs.items():
            subdir_path = os.path.join(base_dir, name.replace('/', '-'))
            print(subdir_path)
            self.create_dir(subdir_path)
            # Downloads are written to the working directory, so switch to
            # the subfolder before descending into it.
            os.chdir(subdir_path)
            self.download_folder_rec(subdir, subdir_path)

    def download_course(self, course, base_dir):
        """Download the complete folder tree of one course into ``base_dir``."""
        print('course ', course)
        self.create_dir(base_dir)
        os.chdir(base_dir)
        root = self.get_top_folder(course)
        self.download_folder_rec(root, base_dir)

    def download_curr_courses(self, base_dir):
        """Download all courses of the current semester below ``base_dir``.

        Each course gets its own directory named after the course title,
        with ``/`` replaced by ``-``.
        """
        print('Start downloading all current courses')
        self.create_dir(base_dir)
        curr_courses = self.get_curr_courses(
            self.get_uid(), self.get_curr_semester())
        os.chdir(base_dir)
        for course, title in curr_courses.items():
            print('course is ', title)
            path = os.path.join(base_dir, title.replace('/', '-'))
            self.download_course(course, path)
|
Loading…
Reference in new issue