mirror of
https://github.com/tiyn/stud.ip-crawler.git
synced 2025-04-04 08:57:46 +02:00
log: adding basic log with levels and file
The log supports several levels that can be set from the command line. Writing the log to a file can be toggled with a flag; the file name is hardcoded as log.txt.
This commit is contained in:
parent
fa36e0f29e
commit
7ef3f063d2
@ -19,9 +19,10 @@ If you run the program again it only downloads files that have changed since the
|
|||||||
- [x] Possible reset of download date
|
- [x] Possible reset of download date
|
||||||
- [x] Incremental file download
|
- [x] Incremental file download
|
||||||
- [x] Store id and chdate of downloaded files
|
- [x] Store id and chdate of downloaded files
|
||||||
- [ ] Logging
|
- [x] Logging
|
||||||
- [x] Console log
|
- [x] Console log
|
||||||
- [ ] Log file
|
- [x] Log file
|
||||||
|
- [x] Specify log level
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
|
1
src/.gitignore
vendored
1
src/.gitignore
vendored
@ -1,2 +1,3 @@
|
|||||||
__pycache__
|
__pycache__
|
||||||
data
|
data
|
||||||
|
log.txt
|
||||||
|
54
src/crawler.py
Normal file
54
src/crawler.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
import os
|
||||||
|
import logging as log
|
||||||
|
|
||||||
|
from studip import Studip
|
||||||
|
|
||||||
|
|
||||||
|
class Crawler:
    """Mirror the files of Stud.IP courses into a local directory tree.

    The crawler only decides *where* files go; talking to the server is
    delegated to the ``studip`` client object handed to the constructor.
    That client's ``download`` writes to a bare file name, i.e. into the
    current working directory, so every method here changes into the
    target directory before triggering downloads.
    """

    def __init__(self, studip):
        """Store the client used for all server requests.

        :param studip: object providing ``get_docs``, ``download``,
            ``get_subdirs``, ``get_top_folder``, ``get_curr_courses``,
            ``get_uid`` and ``get_curr_semester``.
        """
        self.studip = studip

    def download_folder(self, folder):
        """Download every document of ``folder`` into the current directory.

        :param folder: id of the folder whose documents are fetched.
        """
        for doc in self.studip.get_docs(folder):
            # Lazy %-formatting: no TypeError if an id is not a str.
            log.info('found doc %s', doc)
            self.studip.download(doc)

    def download_folder_rec(self, folder, base_dir):
        """Download ``folder`` and all of its subfolders below ``base_dir``.

        :param folder: id of the folder to start from.
        :param base_dir: local directory that receives the folder content;
            it is created if missing.
        """
        # Resolve once, before any chdir, so later joins stay valid even if
        # the caller passed a relative path.
        base_dir = os.path.abspath(base_dir)
        log.info('crawling folder %s', folder)
        self.create_dir(base_dir)
        # Enter the target first: downloads land in the working directory.
        os.chdir(base_dir)
        self.download_folder(folder)
        subdirs = self.studip.get_subdirs(folder)
        for subdir, name in subdirs.items():
            # '/' in a remote name would otherwise be read as a path
            # separator.
            subdir_path = os.path.join(base_dir, name.replace('/', '-'))
            log.debug(subdir_path)
            # The recursive call creates and enters subdir_path itself.
            self.download_folder_rec(subdir, subdir_path)

    def download_course(self, course, base_dir):
        """Download the complete folder tree of one course.

        :param course: id of the course.
        :param base_dir: local directory that receives the course files.
        """
        base_dir = os.path.abspath(base_dir)
        log.info('crawling course %s', course)
        self.create_dir(base_dir)
        os.chdir(base_dir)
        root = self.studip.get_top_folder(course)
        self.download_folder_rec(root, base_dir)

    def download_curr_courses(self, base_dir):
        """Download all courses the client reports for the current semester.

        One subdirectory per course is created below ``base_dir``, named
        after the course with ``/`` replaced by ``-``.

        :param base_dir: root directory of the local mirror.
        """
        base_dir = os.path.abspath(base_dir)
        log.info('Start crawling all current courses')
        self.create_dir(base_dir)
        curr_courses = self.studip.get_curr_courses(
            self.studip.get_uid(), self.studip.get_curr_semester())
        os.chdir(base_dir)
        for course, name in curr_courses.items():
            log.debug('course is %s', name)
            path = os.path.join(base_dir, name.replace('/', '-'))
            self.download_course(course, path)

    def create_dir(self, dir):
        """Create directory ``dir`` (and missing parents) if it is absent.

        :param dir: path of the directory; the name shadows the builtin but
            is kept so existing keyword callers continue to work.
        """
        if not os.path.exists(dir):
            log.info('creating folder %s', dir)
            # makedirs + exist_ok: supports nested output paths and stays
            # safe if the directory appears between the check and the call.
            os.makedirs(dir, exist_ok=True)
|
34
src/mysql.py
34
src/mysql.py
@ -1,27 +1,27 @@
|
|||||||
#!/bin/env python3
|
|
||||||
import time
|
import time
|
||||||
|
import logging as log
|
||||||
|
|
||||||
import pymysql
|
import pymysql
|
||||||
|
|
||||||
|
|
||||||
class Database:
|
class Database:
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, host, port, name, user, passwd, reset_dl):
|
||||||
self.HOST = None
|
self.HOST = host
|
||||||
self.PORT = None
|
self.PORT = port
|
||||||
self.DB_NAME = None
|
self.NAME = name
|
||||||
self.USER = None
|
self.USER = user
|
||||||
self.PASSW = None
|
self.PASSWD = passwd
|
||||||
self.TABLE_FILE = None
|
self.RESET_DL = reset_dl
|
||||||
self.TABLE_FILE = 'files'
|
self.TABLE_FILE = 'files'
|
||||||
self.RESET_DL_DATE = False
|
self.setup_db()
|
||||||
|
|
||||||
def connect(self):
|
def connect(self):
|
||||||
return pymysql.connect(
|
return pymysql.connect(
|
||||||
host=self.HOST,
|
host=self.HOST,
|
||||||
port=self.PORT,
|
port=self.PORT,
|
||||||
user=self.USER,
|
user=self.USER,
|
||||||
password=self.PASSW,
|
password=self.PASSWD,
|
||||||
charset='utf8mb4',
|
charset='utf8mb4',
|
||||||
cursorclass=pymysql.cursors.DictCursor
|
cursorclass=pymysql.cursors.DictCursor
|
||||||
)
|
)
|
||||||
@ -29,21 +29,21 @@ class Database:
|
|||||||
def setup_db(self):
|
def setup_db(self):
|
||||||
db = self.connect()
|
db = self.connect()
|
||||||
crs = db.cursor()
|
crs = db.cursor()
|
||||||
sql_query = "CREATE DATABASE IF NOT EXISTS " + self.DB_NAME
|
sql_query = "CREATE DATABASE IF NOT EXISTS " + self.NAME
|
||||||
crs.execute(sql_query)
|
crs.execute(sql_query)
|
||||||
db.select_db(self.DB_NAME)
|
db.select_db(self.NAME)
|
||||||
query = "CREATE TABLE IF NOT EXISTS " + self.TABLE_FILE + \
|
query = "CREATE TABLE IF NOT EXISTS " + self.TABLE_FILE + \
|
||||||
"(id CHAR(32) NOT NULL," + \
|
"(id CHAR(32) NOT NULL," + \
|
||||||
"ch_date INT(11) NOT NULL," + \
|
"ch_date INT(11) NOT NULL," + \
|
||||||
"PRIMARY KEY(id))"
|
"PRIMARY KEY(id))"
|
||||||
crs.execute(query)
|
crs.execute(query)
|
||||||
print(db)
|
log.debug(db)
|
||||||
|
|
||||||
def set_last_file_dl(self, file_id, time):
|
def set_last_file_dl(self, file_id, time):
|
||||||
db = self.connect()
|
db = self.connect()
|
||||||
db.select_db(self.DB_NAME)
|
db.select_db(self.NAME)
|
||||||
crs = db.cursor()
|
crs = db.cursor()
|
||||||
print('file: ', file_id, ' time: ', time)
|
log.debug('file: ' + file_id + ' time: ' + time)
|
||||||
query = "INSERT INTO " + self.TABLE_FILE + "(`id`,`ch_date`)" + \
|
query = "INSERT INTO " + self.TABLE_FILE + "(`id`,`ch_date`)" + \
|
||||||
"VALUES ('" + file_id + "','" + time + "')" + \
|
"VALUES ('" + file_id + "','" + time + "')" + \
|
||||||
"ON DUPLICATE KEY UPDATE `ch_date` = '" + time + "'"
|
"ON DUPLICATE KEY UPDATE `ch_date` = '" + time + "'"
|
||||||
@ -51,10 +51,10 @@ class Database:
|
|||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
def get_last_file_dl(self, file_id):
|
def get_last_file_dl(self, file_id):
|
||||||
if self.RESET_DL_DATE:
|
if self.RESET_DL:
|
||||||
return None
|
return None
|
||||||
db = self.connect()
|
db = self.connect()
|
||||||
db.select_db(self.DB_NAME)
|
db.select_db(self.NAME)
|
||||||
crs = db.cursor()
|
crs = db.cursor()
|
||||||
query = "SELECT ch_date FROM files WHERE id ='" + file_id + "'"
|
query = "SELECT ch_date FROM files WHERE id ='" + file_id + "'"
|
||||||
crs.execute(query)
|
crs.execute(query)
|
||||||
|
60
src/run.py
60
src/run.py
@ -1,9 +1,12 @@
|
|||||||
#!/bin/env python3
|
#!/bin/env python3
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging as log
|
||||||
|
|
||||||
import studip
|
from studip import Studip
|
||||||
import mysql
|
from crawler import Crawler
|
||||||
|
from mysql import Database
|
||||||
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Download Files from StudIP.')
|
parser = argparse.ArgumentParser(description='Download Files from StudIP.')
|
||||||
@ -16,32 +19,49 @@ parser.add_argument('-p', '--passwd', type=str,
|
|||||||
parser.add_argument('-s', '--url', type=str, help='studip url', required=True)
|
parser.add_argument('-s', '--url', type=str, help='studip url', required=True)
|
||||||
parser.add_argument('--chunk', type=int, default=1024 *
|
parser.add_argument('--chunk', type=int, default=1024 *
|
||||||
1024, help='chunksize for downloading data')
|
1024, help='chunksize for downloading data')
|
||||||
parser.add_argument('-r', '--reset_dl_date', action='store_true', help='downloads everything and ignores last download date')
|
parser.add_argument('-r', '--reset_dl_date', action='store_true',
|
||||||
|
help='downloads everything and ignores last download date')
|
||||||
parser.add_argument('--host', type=str, default='localhost', help='mysql host')
|
parser.add_argument('--host', type=str, default='localhost', help='mysql host')
|
||||||
parser.add_argument('--port', type=int, default=3306, help='mysql port')
|
parser.add_argument('--port', type=int, default=3306, help='mysql port')
|
||||||
parser.add_argument('--db_name', type=str, default='studip', help='mysql database name')
|
parser.add_argument('--db_name', type=str, default='studip',
|
||||||
parser.add_argument('--db_user', type=str, default='root', help='mysql database user')
|
help='mysql database name')
|
||||||
parser.add_argument('--db_passwd', type=str, default='secret-pw', help='mysql database password')
|
parser.add_argument('--db_user', type=str, default='root',
|
||||||
|
help='mysql database user')
|
||||||
|
parser.add_argument('--db_passwd', type=str,
|
||||||
|
default='secret-pw', help='mysql database password')
|
||||||
|
parser.add_argument('-d', '--debug_output', action='store_true',
|
||||||
|
help='display debug information about the process')
|
||||||
|
parser.add_argument('-q', '--quiet', action='store_true',
|
||||||
|
help='only display most important output')
|
||||||
|
parser.add_argument('-l', '--log_file', action='store_true',
|
||||||
|
help='saves log to a log file named "log.txt"')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if args.quiet:
|
||||||
|
log_level = log.WARNING
|
||||||
|
elif args.debug_output:
|
||||||
|
log_level = log.DEBUG
|
||||||
|
else:
|
||||||
|
log_level = log.INFO
|
||||||
|
|
||||||
|
if args.log_file:
|
||||||
|
log.basicConfig(level=log_level, filename='log.txt')
|
||||||
|
else:
|
||||||
|
log.basicConfig(level=log_level)
|
||||||
|
|
||||||
BASE_DIR = os.path.abspath(args.output)
|
BASE_DIR = os.path.abspath(args.output)
|
||||||
USERNAME = args.user
|
USERNAME = args.user
|
||||||
PASSWORD = args.passwd
|
PASSWORD = args.passwd
|
||||||
|
|
||||||
db = mysql.Database()
|
db = Database(args.host, args.port, args.db_name,
|
||||||
|
args.db_user, args.db_passwd, args.reset_dl_date)
|
||||||
|
|
||||||
db.HOST = args.host
|
studip = Studip(args.chunk, args.url, (USERNAME, PASSWORD), db)
|
||||||
db.PORT = args.port
|
|
||||||
db.DB_NAME = args.db_name
|
|
||||||
db.USER = args.db_user
|
|
||||||
db.PASSW = args.db_passwd
|
|
||||||
db.RESET_DL_DATE = args.reset_dl_date
|
|
||||||
db.setup_db()
|
|
||||||
|
|
||||||
crwlr = studip.Crawler(db)
|
crawler = Crawler(studip)
|
||||||
|
|
||||||
crwlr.CHUNK_SIZE = args.chunk
|
# Start crawling
|
||||||
crwlr.STUDIP_DOMAIN = args.url
|
try:
|
||||||
crwlr.USER = (USERNAME, PASSWORD)
|
crawler.download_curr_courses(BASE_DIR)
|
||||||
|
except KeyboardInterrupt:
|
||||||
crwlr.download_curr_courses(BASE_DIR)
|
sys.exit(0)
|
||||||
|
@ -1,35 +1,30 @@
|
|||||||
#!/bin/env python3
|
|
||||||
import time
|
import time
|
||||||
import os
|
import logging as log
|
||||||
import argparse
|
|
||||||
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import requests as req
|
import requests as req
|
||||||
from requests.auth import HTTPBasicAuth
|
from requests.auth import HTTPBasicAuth
|
||||||
|
|
||||||
|
|
||||||
class Crawler:
|
class Studip:
|
||||||
|
|
||||||
def __init__(self, db):
|
def __init__(self, chunk_size, domain, user, db):
|
||||||
self.CHUNK_SIZE = None
|
self.CHUNK_SIZE = chunk_size
|
||||||
self.STUDIP_DOMAIN = None
|
self.DOMAIN = domain
|
||||||
self.USER = None
|
self.USER = user
|
||||||
self.db = db
|
self.db = db
|
||||||
|
|
||||||
def create_dir(self, dir):
|
def auth_req(self, url):
|
||||||
if not os.path.exists(dir):
|
url = self.DOMAIN + url
|
||||||
print('creating folder', dir)
|
return req.get(url, auth=self.USER)
|
||||||
os.mkdir(dir)
|
|
||||||
|
|
||||||
def get_uid(self):
|
def get_uid(self):
|
||||||
url = self.STUDIP_DOMAIN + '/api.php/user/'
|
rsp = self.auth_req('/api.php/user/')
|
||||||
rsp = req.get(url, auth=self.USER)
|
|
||||||
user_id = rsp.json()['user_id']
|
user_id = rsp.json()['user_id']
|
||||||
return user_id
|
return user_id
|
||||||
|
|
||||||
def get_curr_semester(self):
|
def get_curr_semester(self):
|
||||||
url = self.STUDIP_DOMAIN + '/api.php/semesters/'
|
rsp = self.auth_req('/api.php/semesters/')
|
||||||
rsp = req.get(url, auth=self.USER)
|
|
||||||
curr_time = int(str(int(time.time())))
|
curr_time = int(str(int(time.time())))
|
||||||
semesters = rsp.json()['collection']
|
semesters = rsp.json()['collection']
|
||||||
for sem_uri in semesters:
|
for sem_uri in semesters:
|
||||||
@ -41,8 +36,7 @@ class Crawler:
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
def get_ordered_semesters(self):
|
def get_ordered_semesters(self):
|
||||||
url = self.STUDIP_DOMAIN + '/api.php/semesters/'
|
rsp = self.auth_req('/api.php/semesters/')
|
||||||
rsp = req.get(url, auth=self.USER)
|
|
||||||
semesters = rsp.json()['collection']
|
semesters = rsp.json()['collection']
|
||||||
order_sems = []
|
order_sems = []
|
||||||
for sem_uri in semesters:
|
for sem_uri in semesters:
|
||||||
@ -50,8 +44,7 @@ class Crawler:
|
|||||||
return order_sems
|
return order_sems
|
||||||
|
|
||||||
def get_curr_courses(self, user_id, semester):
|
def get_curr_courses(self, user_id, semester):
|
||||||
url = self.STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses'
|
rsp = self.auth_req('/api.php/user/' + user_id + '/courses')
|
||||||
rsp = req.get(url, auth=self.USER)
|
|
||||||
ord_sems = self.get_ordered_semesters()
|
ord_sems = self.get_ordered_semesters()
|
||||||
courses = rsp.json()['collection']
|
courses = rsp.json()['collection']
|
||||||
i = 0
|
i = 0
|
||||||
@ -76,15 +69,13 @@ class Crawler:
|
|||||||
return course_list
|
return course_list
|
||||||
|
|
||||||
def get_top_folder(self, course):
|
def get_top_folder(self, course):
|
||||||
url = self.STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder'
|
rsp = self.auth_req('/api.php/course/' + course + '/top_folder')
|
||||||
rsp = req.get(url, auth=self.USER)
|
|
||||||
top_folder = rsp.json()
|
top_folder = rsp.json()
|
||||||
tf_id = top_folder['id']
|
tf_id = top_folder['id']
|
||||||
return(tf_id)
|
return(tf_id)
|
||||||
|
|
||||||
def get_docs(self, folder):
|
def get_docs(self, folder):
|
||||||
url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder
|
rsp = self.auth_req('/api.php/folder/' + folder)
|
||||||
rsp = req.get(url, auth=self.USER)
|
|
||||||
docs = rsp.json()['file_refs']
|
docs = rsp.json()['file_refs']
|
||||||
res_docs = []
|
res_docs = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
@ -93,16 +84,14 @@ class Crawler:
|
|||||||
return(res_docs)
|
return(res_docs)
|
||||||
|
|
||||||
def download(self, doc):
|
def download(self, doc):
|
||||||
url1 = self.STUDIP_DOMAIN + '/api.php/file/' + doc
|
rsp1 = self.auth_req('/api.php/file/' + doc)
|
||||||
rsp1 = req.get(url1, auth=self.USER)
|
|
||||||
doc_name = rsp1.json()['name']
|
doc_name = rsp1.json()['name']
|
||||||
doc_chdate = rsp1.json()['chdate']
|
doc_chdate = rsp1.json()['chdate']
|
||||||
last_dl = self.db.get_last_file_dl(doc)
|
last_dl = self.db.get_last_file_dl(doc)
|
||||||
if last_dl == None or last_dl < doc_chdate:
|
if last_dl == None or last_dl < doc_chdate:
|
||||||
print('downloading ', doc_name)
|
rsp2 = self.auth_req('/api.php/file/' + doc + '/download')
|
||||||
url2 = self.STUDIP_DOMAIN + '/api.php/file/' + doc + '/download'
|
|
||||||
rsp2 = req.get(url2, auth=self.USER, stream=True)
|
|
||||||
total_size = int(rsp2.headers.get('content-length', 0))
|
total_size = int(rsp2.headers.get('content-length', 0))
|
||||||
|
print('downloading ' + doc_name)
|
||||||
progbar = tqdm(total=total_size, unit='iB', unit_scale=True)
|
progbar = tqdm(total=total_size, unit='iB', unit_scale=True)
|
||||||
with open(doc_name, 'wb') as doc_file:
|
with open(doc_name, 'wb') as doc_file:
|
||||||
for chunk in rsp2.iter_content(self.CHUNK_SIZE):
|
for chunk in rsp2.iter_content(self.CHUNK_SIZE):
|
||||||
@ -111,8 +100,7 @@ class Crawler:
|
|||||||
self.db.set_last_file_dl(str(doc), str(int(time.time())))
|
self.db.set_last_file_dl(str(doc), str(int(time.time())))
|
||||||
|
|
||||||
def get_subdirs(self, folder):
|
def get_subdirs(self, folder):
|
||||||
url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder
|
rsp = self.auth_req('/api.php/folder/' + folder)
|
||||||
rsp = req.get(url, auth=self.USER)
|
|
||||||
subdirs = rsp.json()['subfolders']
|
subdirs = rsp.json()['subfolders']
|
||||||
docs = rsp.json()['file_refs']
|
docs = rsp.json()['file_refs']
|
||||||
res_subdirs = {}
|
res_subdirs = {}
|
||||||
@ -121,42 +109,3 @@ class Crawler:
|
|||||||
sub_name = subdir['name']
|
sub_name = subdir['name']
|
||||||
res_subdirs[sub_id] = sub_name
|
res_subdirs[sub_id] = sub_name
|
||||||
return res_subdirs
|
return res_subdirs
|
||||||
|
|
||||||
def download_folder(self, folder):
|
|
||||||
docs = self.get_docs(folder)
|
|
||||||
for doc in docs:
|
|
||||||
print('found doc ', doc)
|
|
||||||
self.download(doc)
|
|
||||||
|
|
||||||
def download_folder_rec(self, folder, base_dir):
|
|
||||||
print('folder ', folder)
|
|
||||||
self.create_dir(base_dir)
|
|
||||||
self.download_folder(folder)
|
|
||||||
subdirs = self.get_subdirs(folder)
|
|
||||||
os.chdir(base_dir)
|
|
||||||
for subdir in subdirs:
|
|
||||||
subdir_name = subdirs[subdir].replace('/', '-')
|
|
||||||
subdir_path = os.path.join(base_dir, subdir_name)
|
|
||||||
print(subdir_path)
|
|
||||||
self.create_dir(subdir_path)
|
|
||||||
os.chdir(subdir_path)
|
|
||||||
self.download_folder_rec(subdir, subdir_path)
|
|
||||||
|
|
||||||
def download_course(self, course, base_dir):
|
|
||||||
print('course ', course)
|
|
||||||
self.create_dir(base_dir)
|
|
||||||
os.chdir(base_dir)
|
|
||||||
root = self.get_top_folder(course)
|
|
||||||
self.download_folder_rec(root, base_dir)
|
|
||||||
|
|
||||||
def download_curr_courses(self, base_dir):
|
|
||||||
print('Start downloading all current courses')
|
|
||||||
self.create_dir(base_dir)
|
|
||||||
curr_courses = self.get_curr_courses(
|
|
||||||
self.get_uid(), self.get_curr_semester())
|
|
||||||
os.chdir(base_dir)
|
|
||||||
for course in curr_courses:
|
|
||||||
print('course is ', curr_courses[course])
|
|
||||||
course_name = curr_courses[course].replace('/', '-')
|
|
||||||
path = os.path.join(base_dir, course_name)
|
|
||||||
self.download_course(course, path)
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user