- mysql creates the database and tables on the given MySQL server if they do not already exist - mysql reads last change values from db - mysql saves ch_date after downloading - run now takes care of the variables of mysql and studipmaster
@ -1,2 +1 @@ |
|||
last_dl.txt |
|||
data |
|||
database |
|||
@ -1,202 +0,0 @@ |
|||
#!/bin/env python3 |
|||
import time |
|||
import os |
|||
import argparse |
|||
|
|||
from tqdm import tqdm |
|||
import requests as req |
|||
from requests.auth import HTTPBasicAuth |
|||
|
|||
|
|||
def create_dir(dir):
    """Create the directory ``dir`` if it does not exist yet.

    Missing parent directories are created as well. A notice is printed
    only when the path did not exist before the call.

    Args:
        dir: Path of the directory to create.
    """
    if not os.path.exists(dir):
        print('creating folder', dir)
        # makedirs (unlike mkdir) creates missing parents; exist_ok covers
        # the directory appearing between the check above and this call.
        os.makedirs(dir, exist_ok=True)
|||
|
|||
|
|||
def set_last_dl(time):
    """Write the last-download marker to ``last_dl.txt`` in the cwd.

    The value is stored as ``str(time)`` truncated at the first ``.``, so
    a float timestamp is written as its integer part. Any other value
    (including ``None``) is written as its plain string form.

    Args:
        time: Timestamp to record (the parameter shadows the ``time``
            module inside this function).
    """
    # The context manager guarantees the text is flushed and the handle
    # is closed even if the write fails.
    with open('last_dl.txt', 'w') as last_dl_file:
        last_dl_file.write(str(time).split('.')[0])
|||
|
|||
|
|||
def get_last_dl():
    """Read the last-download marker from ``last_dl.txt`` in the cwd.

    Returns:
        The stored timestamp as an ``int``, or ``None`` when the file
        cannot be read or does not contain an integer.
    """
    try:
        with open('last_dl.txt', 'r') as last_dl_file:
            return int(last_dl_file.read())
    except (OSError, ValueError):
        # Missing/unreadable file or non-numeric content (e.g. the text
        # "None") both mean "no previous download recorded".
        return None
|||
|
|||
# Command-line options of the downloader.
parser = argparse.ArgumentParser(description='Download Files from StudIP.')
parser.add_argument(
    '-o', '--output',
    type=str,
    default='./data',
    help='path to output directory',
)
parser.add_argument(
    '-u', '--user',
    type=str,
    help='studip username',
    required=True,
)
parser.add_argument(
    '-p', '--passw',
    type=str,
    help='studip password',
    required=True,
)
parser.add_argument(
    '-s', '--url',
    type=str,
    help='studip url',
    required=True,
)
parser.add_argument(
    '-c', '--chunk',
    type=int,
    default=1024 * 1024,
    help='chunksize for downloading data',
)
parser.add_argument('-r', '--reset_dl_date', action='store_true')

args = parser.parse_args()

# Module-level settings read by the functions below.
BASE_DIR = os.path.abspath(args.output)
CHUNK_SIZE = args.chunk
STUDIP_DOMAIN = args.url
USERNAME = args.user
PASSWORD = args.passw
USER = (USERNAME, PASSWORD)

# With -r the stored marker is overwritten before it is read back, so
# LAST_DOWNLOAD ends up as None.
if args.reset_dl_date:
    set_last_dl(None)
LAST_DOWNLOAD = get_last_dl()
|||
|
|||
|
|||
def get_uid():
    """Return the ``user_id`` field of the ``/api.php/user/`` response."""
    response = req.get(STUDIP_DOMAIN + '/api.php/user/', auth=USER)
    return response.json()['user_id']
|||
|
|||
|
|||
def get_curr_semester():
    """Return the key of the semester that contains the current time.

    Queries ``/api.php/semesters/`` and returns the first key of the
    ``collection`` whose ``begin`` and ``end`` values strictly enclose the
    current Unix time. Returns ``0`` when no entry matches.
    """
    response = req.get(STUDIP_DOMAIN + '/api.php/semesters/', auth=USER)
    now = int(time.time())
    collection = response.json()['collection']
    for key in collection:
        entry = collection[key]
        begin = entry['begin']
        end = entry['end']
        if begin < now < end:
            return key
    return 0
|||
|
|||
|
|||
def get_ordered_semesters():
    """Return the keys of the semester ``collection`` as a list.

    The keys keep the order in which the ``/api.php/semesters/`` response
    lists them.
    """
    response = req.get(STUDIP_DOMAIN + '/api.php/semesters/', auth=USER)
    return list(response.json()['collection'])
|||
|
|||
|
|||
def get_curr_courses(user_id, semester):
    """Return the user's courses that are running in ``semester``.

    A course is included when the position of ``semester`` in the ordered
    semester list lies between the positions of the course's
    ``start_semester`` and ``end_semester`` (inclusive). A course without
    an end semester is treated as open-ended.

    Args:
        user_id: Id of the user whose courses are listed.
        semester: Semester key as returned by ``get_curr_semester``.

    Returns:
        Dict mapping ``course_id`` to course ``title``.

    Raises:
        ValueError: If ``semester`` (or a course's start/end semester) is
            not part of the ordered semester list.
    """
    url = STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses'
    rsp = req.get(url, auth=USER)
    ord_sems = get_ordered_semesters()
    courses = rsp.json()['collection']
    if not courses:
        return {}

    # The position of the current semester does not depend on the course,
    # so look it up once instead of once per course.
    curr_ind = ord_sems.index(semester)
    # Sentinel that sorts after every real semester index (replaces the
    # hard-coded 100, which breaks once more semesters exist).
    # NOTE(review): a missing start semester therefore still excludes the
    # course, as before - confirm that this is intended.
    open_ended = len(ord_sems)

    course_list = {}
    for course_uri in courses:
        course = courses[course_uri]
        start_sem = course['start_semester']
        end_sem = course['end_semester']
        if start_sem is not None:
            start_ind = ord_sems.index(start_sem)
        else:
            start_ind = open_ended
        if end_sem is not None:
            end_ind = ord_sems.index(end_sem)
        else:
            end_ind = open_ended
        if start_ind <= curr_ind <= end_ind:
            course_list[course['course_id']] = course['title']
    return course_list
|||
|
|||
|
|||
def get_top_folder(course):
    """Return the ``id`` of the top folder of the course ``course``."""
    endpoint = STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder'
    return req.get(endpoint, auth=USER).json()['id']
|||
|
|||
|
|||
def get_docs(folder):
    """Return the ids of all ``file_refs`` listed for the folder ``folder``."""
    response = req.get(STUDIP_DOMAIN + '/api.php/folder/' + folder, auth=USER)
    return [entry['id'] for entry in response.json()['file_refs']]
|||
|
|||
|
|||
def download(doc, time):
    """Download the file ``doc`` into the cwd if it changed since ``time``.

    The file metadata is fetched first. The content is downloaded only
    when ``time`` is ``None`` or older than the file's ``chdate``. The
    file is written under the name reported by the API, relative to the
    current working directory.

    Args:
        doc: Id of the file to download.
        time: Timestamp of the last download, or ``None`` to always
            download (the parameter shadows the ``time`` module here).
    """
    url1 = STUDIP_DOMAIN + '/api.php/file/' + doc
    # Parse the metadata once instead of once per field.
    meta = req.get(url1, auth=USER).json()
    doc_name = meta['name']
    doc_chdate = meta['chdate']
    if time is None or time < doc_chdate:
        print('downloading ', doc_name)
        url2 = STUDIP_DOMAIN + '/api.php/file/' + doc + '/download'
        # Context managers release the streamed connection, the progress
        # bar and the output file even when the transfer fails midway.
        with req.get(url2, auth=USER, stream=True) as rsp2:
            total_size = int(rsp2.headers.get('content-length', 0))
            with tqdm(total=total_size, unit='iB', unit_scale=True) as progbar, \
                    open(doc_name, 'wb') as doc_file:
                for chunk in rsp2.iter_content(CHUNK_SIZE):
                    progbar.update(len(chunk))
                    doc_file.write(chunk)
|||
|
|||
|
|||
def get_subdirs(folder):
    """Return the direct subfolders of the folder ``folder``.

    Returns:
        Dict mapping each subfolder ``id`` to its ``name``.
    """
    url = STUDIP_DOMAIN + '/api.php/folder/' + folder
    rsp = req.get(url, auth=USER)
    # Only the subfolders are needed; the file list is handled by get_docs.
    return {subdir['id']: subdir['name'] for subdir in rsp.json()['subfolders']}
|||
|
|||
|
|||
def download_folder(folder, time):
    """Download every file listed directly in ``folder``.

    Each file id is announced on stdout and handed to ``download``
    together with ``time``.
    """
    for doc_id in get_docs(folder):
        print('found doc ', doc_id)
        download(doc_id, time)
|||
|
|||
|
|||
def download_folder_rec(folder, time, base_dir):
    """Download ``folder`` and, recursively, all of its subfolders.

    The files of ``folder`` are downloaded first. Each subfolder then gets
    its own directory below ``base_dir`` (with '/' in the name replaced by
    '-'), which becomes the working directory before the recursion.
    """
    print('folder ', folder)
    create_dir(base_dir)
    download_folder(folder, time)
    children = get_subdirs(folder)
    os.chdir(base_dir)
    for child_id, child_name in children.items():
        child_path = os.path.join(base_dir, child_name.replace('/', '-'))
        print(child_path)
        create_dir(child_path)
        os.chdir(child_path)
        download_folder_rec(child_id, time, child_path)
|||
|
|||
|
|||
def download_course(course, time, base_dir):
    """Download the whole folder tree of ``course`` into ``base_dir``.

    ``base_dir`` is created if needed and made the working directory
    before the course's top folder is downloaded recursively.
    """
    print('course ', course)
    create_dir(base_dir)
    os.chdir(base_dir)
    top_folder_id = get_top_folder(course)
    download_folder_rec(top_folder_id, time, base_dir)
|||
|
|||
|
|||
def download_curr_courses(time, base_dir):
    """Download all courses of the current semester below ``base_dir``.

    Every course gets a directory named after its title, with '/' replaced
    by '-'.
    """
    print('Start downloading all current courses')
    create_dir(base_dir)
    courses = get_curr_courses(get_uid(), get_curr_semester())
    os.chdir(base_dir)
    for course_id, title in courses.items():
        print('course is ', title)
        target = os.path.join(base_dir, title.replace('/', '-'))
        download_course(course_id, time, target)
|||
|
|||
|
|||
# Script entry: download the current courses using the stored last-download
# marker, then record the current time as the new marker.
download_curr_courses(LAST_DOWNLOAD, BASE_DIR)
set_last_dl(time.time())
|||
@ -0,0 +1,2 @@ |
|||
__pycache__ |
|||
data |
|||
@ -0,0 +1,64 @@ |
|||
#!/bin/env python3 |
|||
import time |
|||
|
|||
import pymysql |
|||
|
|||
|
|||
class Database:
    """MySQL-backed store of the last download time of each file.

    The connection settings are plain attributes which the caller assigns
    after construction:

    * ``HOST``, ``PORT``, ``USER``, ``PASSW`` - passed to ``pymysql.connect``
    * ``DB_NAME`` - database that holds the table
    * ``TABLE_FILE`` - name of the table (defaults to ``'files'``)
    * ``RESET_DL_DATE`` - when true, ``get_last_file_dl`` always reports
      "never downloaded"

    Every method opens its own connection and closes it before returning.
    """

    def __init__(self):
        self.HOST = None
        self.PORT = None
        self.DB_NAME = None
        self.USER = None
        self.PASSW = None
        self.TABLE_FILE = 'files'
        self.RESET_DL_DATE = False

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` as a backtick-quoted MySQL identifier.

        Identifiers (database/table names) cannot be sent as query
        parameters, so they are quoted here; embedded backticks are
        doubled so the name cannot break out of the quotes.
        """
        return '`' + str(name).replace('`', '``') + '`'

    def connect(self):
        """Open and return a new server connection (no database selected)."""
        return pymysql.connect(
            host=self.HOST,
            port=self.PORT,
            user=self.USER,
            password=self.PASSW,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor
        )

    def setup_db(self):
        """Create the database and the file table if they do not exist."""
        db = self.connect()
        try:
            with db.cursor() as crs:
                crs.execute("CREATE DATABASE IF NOT EXISTS "
                            + self._quote_identifier(self.DB_NAME))
                db.select_db(self.DB_NAME)
                crs.execute(
                    "CREATE TABLE IF NOT EXISTS "
                    + self._quote_identifier(self.TABLE_FILE)
                    + " (id CHAR(32) NOT NULL,"
                    " ch_date INT(11) NOT NULL,"
                    " PRIMARY KEY(id))"
                )
            print(db)
        finally:
            # Close explicitly: the connection is only needed for setup.
            db.close()

    def set_last_file_dl(self, file_id, time):
        """Store ``time`` as the download time of ``file_id``.

        An existing row for ``file_id`` is updated, otherwise a new row is
        inserted.

        Args:
            file_id: Id of the downloaded file.
            time: Timestamp to store (string or integer; the parameter
                shadows the ``time`` module inside this method).
        """
        db = self.connect()
        try:
            db.select_db(self.DB_NAME)
            print('file: ', file_id, ' time: ', time)
            query = (
                "INSERT INTO " + self._quote_identifier(self.TABLE_FILE)
                + " (`id`, `ch_date`) VALUES (%s, %s)"
                " ON DUPLICATE KEY UPDATE `ch_date` = %s"
            )
            with db.cursor() as crs:
                # Values are passed as parameters so the driver escapes
                # them instead of them being pasted into the SQL text.
                crs.execute(query, (file_id, time, time))
            db.commit()
        finally:
            db.close()

    def get_last_file_dl(self, file_id):
        """Return the stored download time of ``file_id``.

        Returns:
            The ``ch_date`` value of the row, or ``None`` when no row
            exists or ``RESET_DL_DATE`` is set (no connection is opened
            in that case).
        """
        if self.RESET_DL_DATE:
            return None
        db = self.connect()
        try:
            db.select_db(self.DB_NAME)
            # Use the configured table (the same one setup_db creates and
            # set_last_file_dl writes) rather than a hard-coded name.
            query = ("SELECT ch_date FROM "
                     + self._quote_identifier(self.TABLE_FILE)
                     + " WHERE id = %s")
            with db.cursor() as crs:
                crs.execute(query, (file_id,))
                res = crs.fetchone()
        finally:
            db.close()
        if res is not None:
            return res['ch_date']
        return None
|||
@ -1,2 +1,3 @@ |
|||
tqdm==4.46.1 |
|||
requests==2.23.0 |
|||
PyMySQL==0.9.3 |
|||
@ -0,0 +1,47 @@ |
|||
#!/bin/env python3 |
|||
import os |
|||
import argparse |
|||
|
|||
import studip |
|||
import mysql |
|||
|
|||
|
|||
# Command-line options: Stud.IP access first, MySQL access second.
parser = argparse.ArgumentParser(description='Download Files from StudIP.')
parser.add_argument(
    '-o', '--output',
    type=str,
    default='./data',
    help='path to output directory',
)
parser.add_argument(
    '-u', '--user',
    type=str,
    help='studip username',
    required=True,
)
parser.add_argument(
    '-p', '--passwd',
    type=str,
    help='studip password',
    required=True,
)
parser.add_argument(
    '-s', '--url',
    type=str,
    help='studip url',
    required=True,
)
parser.add_argument(
    '--chunk',
    type=int,
    default=1024 * 1024,
    help='chunksize for downloading data',
)
parser.add_argument(
    '-r', '--reset_dl_date',
    action='store_true',
    help='downloads everything and ignores last download date',
)
parser.add_argument(
    '--host',
    type=str,
    default='localhost',
    help='mysql host',
)
parser.add_argument(
    '--port',
    type=int,
    default=3306,
    help='mysql port',
)
parser.add_argument(
    '--db_name',
    type=str,
    default='studip',
    help='mysql database name',
)
parser.add_argument(
    '--db_user',
    type=str,
    default='root',
    help='mysql database user',
)
parser.add_argument(
    '--db_passwd',
    type=str,
    default='secret-pw',
    help='mysql database password',
)
args = parser.parse_args()

BASE_DIR = os.path.abspath(args.output)
USERNAME = args.user
PASSWORD = args.passwd

# Configure the database helper and make sure its schema exists.
db = mysql.Database()
db.HOST = args.host
db.PORT = args.port
db.DB_NAME = args.db_name
db.USER = args.db_user
db.PASSW = args.db_passwd
db.RESET_DL_DATE = args.reset_dl_date
db.setup_db()

# Configure the crawler and start the download.
crwlr = studip.Crawler(db)
crwlr.CHUNK_SIZE = args.chunk
crwlr.STUDIP_DOMAIN = args.url
crwlr.USER = (USERNAME, PASSWORD)

crwlr.download_curr_courses(BASE_DIR)
|||
@ -0,0 +1,162 @@ |
|||
#!/bin/env python3 |
|||
import time |
|||
import os |
|||
import argparse |
|||
|
|||
from tqdm import tqdm |
|||
import requests as req |
|||
from requests.auth import HTTPBasicAuth |
|||
|
|||
|
|||
class Crawler:
    """Download the files of a user's current Stud.IP courses.

    The caller assigns the settings after construction:

    * ``CHUNK_SIZE`` - chunk size passed to ``iter_content`` while streaming
    * ``STUDIP_DOMAIN`` - base URL; API paths are appended to it
    * ``USER`` - value passed as ``auth`` to every request

    ``db`` must provide ``get_last_file_dl(file_id)`` and
    ``set_last_file_dl(file_id, time)``; it decides which files are
    downloaded again.

    Files are written relative to the current working directory, which the
    ``download_*`` methods change as they walk the folder tree.
    """

    def __init__(self, db):
        self.CHUNK_SIZE = None
        self.STUDIP_DOMAIN = None
        self.USER = None
        self.db = db

    @staticmethod
    def _safe_name(name):
        """Return ``name`` with '/' replaced by '-' (one path component)."""
        return name.replace('/', '-')

    def create_dir(self, dir):
        """Create directory ``dir`` (and missing parents) if it is absent."""
        if not os.path.exists(dir):
            print('creating folder', dir)
            # makedirs also creates missing parents; exist_ok covers the
            # directory appearing between the check and this call.
            os.makedirs(dir, exist_ok=True)

    def get_uid(self):
        """Return the ``user_id`` field of the ``/api.php/user/`` response."""
        url = self.STUDIP_DOMAIN + '/api.php/user/'
        rsp = req.get(url, auth=self.USER)
        return rsp.json()['user_id']

    def get_curr_semester(self):
        """Return the key of the semester containing the current time.

        Returns ``0`` when no semester's ``begin``/``end`` strictly enclose
        the current Unix time.
        """
        url = self.STUDIP_DOMAIN + '/api.php/semesters/'
        rsp = req.get(url, auth=self.USER)
        curr_time = int(time.time())
        semesters = rsp.json()['collection']
        for sem_uri in semesters:
            semester = semesters[sem_uri]
            sem_begin = semester['begin']
            sem_end = semester['end']
            if sem_begin < curr_time < sem_end:
                return sem_uri
        return 0

    def get_ordered_semesters(self):
        """Return the semester keys in the order the API lists them."""
        url = self.STUDIP_DOMAIN + '/api.php/semesters/'
        rsp = req.get(url, auth=self.USER)
        return list(rsp.json()['collection'])

    def get_curr_courses(self, user_id, semester):
        """Return the user's courses that are running in ``semester``.

        A course is included when the position of ``semester`` in the
        ordered semester list lies between the positions of its
        ``start_semester`` and ``end_semester`` (inclusive); a missing end
        semester means open-ended.

        Returns:
            Dict mapping ``course_id`` to course ``title``.

        Raises:
            ValueError: If ``semester`` (or a course's start/end semester)
                is not in the ordered semester list.
        """
        url = self.STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses'
        rsp = req.get(url, auth=self.USER)
        ord_sems = self.get_ordered_semesters()
        courses = rsp.json()['collection']
        if not courses:
            return {}

        # Loop-invariant: look up the current semester's position once.
        curr_ind = ord_sems.index(semester)
        # Sentinel that sorts after every real index (replaces the
        # hard-coded 100, which breaks once more semesters exist).
        # NOTE(review): a missing start semester therefore still excludes
        # the course, as before - confirm that this is intended.
        open_ended = len(ord_sems)

        course_list = {}
        for course_uri in courses:
            course = courses[course_uri]
            start_sem = course['start_semester']
            end_sem = course['end_semester']
            if start_sem is not None:
                start_ind = ord_sems.index(start_sem)
            else:
                start_ind = open_ended
            if end_sem is not None:
                end_ind = ord_sems.index(end_sem)
            else:
                end_ind = open_ended
            if start_ind <= curr_ind <= end_ind:
                course_list[course['course_id']] = course['title']
        return course_list

    def get_top_folder(self, course):
        """Return the ``id`` of the top folder of ``course``."""
        url = self.STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder'
        rsp = req.get(url, auth=self.USER)
        return rsp.json()['id']

    def get_docs(self, folder):
        """Return the ids of all ``file_refs`` listed for ``folder``."""
        url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder
        rsp = req.get(url, auth=self.USER)
        return [doc['id'] for doc in rsp.json()['file_refs']]

    def download(self, doc):
        """Download file ``doc`` into the cwd if it changed since last time.

        The file is fetched when the database has no entry for it or the
        stored time is older than the file's ``chdate``. After a successful
        transfer the current time is stored for the file. A download whose
        HTTP status signals an error is reported and skipped without being
        recorded, so it is retried on the next run.
        """
        url1 = self.STUDIP_DOMAIN + '/api.php/file/' + doc
        # Parse the metadata once instead of once per field.
        meta = req.get(url1, auth=self.USER).json()
        doc_name = meta['name']
        doc_chdate = meta['chdate']
        last_dl = self.db.get_last_file_dl(doc)
        if last_dl is not None and not last_dl < doc_chdate:
            return

        print('downloading ', doc_name)
        url2 = self.STUDIP_DOMAIN + '/api.php/file/' + doc + '/download'
        # Context managers release the connection, the progress bar and
        # the output file even if the transfer fails midway.
        with req.get(url2, auth=self.USER, stream=True) as rsp2:
            if not rsp2.ok:
                # Do not save an error page as the file or mark it done.
                print('download failed with status', rsp2.status_code,
                      'for', doc_name)
                return
            total_size = int(rsp2.headers.get('content-length', 0))
            # Sanitise the server-supplied name like directory names are,
            # so a '/' cannot redirect the write to another directory.
            target = self._safe_name(doc_name)
            with tqdm(total=total_size, unit='iB', unit_scale=True) as progbar, \
                    open(target, 'wb') as doc_file:
                for chunk in rsp2.iter_content(self.CHUNK_SIZE):
                    progbar.update(len(chunk))
                    doc_file.write(chunk)
        self.db.set_last_file_dl(str(doc), str(int(time.time())))

    def get_subdirs(self, folder):
        """Return a dict mapping subfolder ``id`` to ``name`` for ``folder``."""
        url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder
        rsp = req.get(url, auth=self.USER)
        return {sub['id']: sub['name'] for sub in rsp.json()['subfolders']}

    def download_folder(self, folder):
        """Download every file listed directly in ``folder`` into the cwd."""
        for doc in self.get_docs(folder):
            print('found doc ', doc)
            self.download(doc)

    def download_folder_rec(self, folder, base_dir):
        """Download ``folder`` and all nested subfolders below ``base_dir``."""
        print('folder ', folder)
        self.create_dir(base_dir)
        # download() writes relative to the cwd, so enter the target
        # directory before fetching this folder's files.
        os.chdir(base_dir)
        self.download_folder(folder)
        subdirs = self.get_subdirs(folder)
        for subdir, name in subdirs.items():
            subdir_path = os.path.join(base_dir, self._safe_name(name))
            print(subdir_path)
            self.create_dir(subdir_path)
            self.download_folder_rec(subdir, subdir_path)

    def download_course(self, course, base_dir):
        """Download the whole folder tree of ``course`` into ``base_dir``."""
        print('course ', course)
        self.create_dir(base_dir)
        os.chdir(base_dir)
        root = self.get_top_folder(course)
        self.download_folder_rec(root, base_dir)

    def download_curr_courses(self, base_dir):
        """Download all courses of the current semester below ``base_dir``.

        Every course gets a directory named after its title, with '/'
        replaced by '-'.
        """
        print('Start downloading all current courses')
        # The walk changes the cwd repeatedly; an absolute base keeps every
        # later chdir/join independent of where the walk currently is.
        base_dir = os.path.abspath(base_dir)
        self.create_dir(base_dir)
        curr_courses = self.get_curr_courses(
            self.get_uid(), self.get_curr_semester())
        os.chdir(base_dir)
        for course, title in curr_courses.items():
            print('course is ', title)
            path = os.path.join(base_dir, self._safe_name(title))
            self.download_course(course, path)
|||