This program downloads all files of a Stud.IP user's current semester.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

162 lines
5.7 KiB

  1. #!/bin/env python3
  2. import time
  3. import os
  4. import argparse
  5. from tqdm import tqdm
  6. import requests as req
  7. from requests.auth import HTTPBasicAuth
  8. class Crawler:
  9. def __init__(self, db):
  10. self.CHUNK_SIZE = None
  11. self.STUDIP_DOMAIN = None
  12. self.USER = None
  13. self.db = db
  def create_dir(self, dir):
      """Create the directory ``dir`` unless the path already exists.

      Missing parent directories are created as well, so a nested target
      such as ``a/b/c`` works even when ``a`` does not exist yet.

      :param dir: path of the directory to create. The name shadows the
          ``dir`` builtin but is kept so keyword callers keep working.
      """
      if os.path.exists(dir):
          return
      print('creating folder', dir)
      # exist_ok guards against the directory appearing between the
      # existence check above and the actual creation.
      os.makedirs(dir, exist_ok=True)
  def get_uid(self):
      """Return the ``user_id`` field of the ``/api.php/user/`` response.

      The request goes to ``self.STUDIP_DOMAIN`` and is authenticated with
      ``self.USER``.
      """
      response = req.get(self.STUDIP_DOMAIN + '/api.php/user/', auth=self.USER)
      return response.json()['user_id']
  def get_curr_semester(self):
      """Return the key of the semester that contains the current time.

      Every entry of the ``collection`` returned by ``/api.php/semesters/``
      is checked in iteration order; the first one whose ``begin`` and
      ``end`` values strictly enclose the current Unix time wins.

      :return: the key of the matching collection entry, or ``0`` when no
          entry matches.
      """
      response = req.get(self.STUDIP_DOMAIN + '/api.php/semesters/', auth=self.USER)
      now = int(time.time())
      collection = response.json()['collection']
      for sem_uri in collection:
          entry = collection[sem_uri]
          begin, end = entry['begin'], entry['end']
          if begin < now and now < end:
              return sem_uri
      return 0
  def get_ordered_semesters(self):
      """Return the keys of the semester collection as a list.

      The keys come from the ``collection`` of ``/api.php/semesters/`` and
      keep the order in which that collection is iterated.
      """
      response = req.get(self.STUDIP_DOMAIN + '/api.php/semesters/', auth=self.USER)
      return list(response.json()['collection'])
  def get_curr_courses(self, user_id, semester):
      """Return the courses of a user that run during ``semester``.

      A course is selected when the position of ``semester`` in the list
      from ``get_ordered_semesters`` lies between the positions of the
      course's ``start_semester`` and ``end_semester`` (both inclusive).
      A missing ``end_semester`` is treated as open-ended. A missing
      ``start_semester`` sorts after every known semester, so such a
      course is never selected.

      :param user_id: id inserted into the ``/api.php/user/<id>/courses``
          URL; must be a string because it is concatenated.
      :param semester: key of the semester of interest, as found in the
          list returned by ``get_ordered_semesters``.
      :return: dict mapping ``course_id`` to course ``title``.
      :raises ValueError: if ``semester`` or a semester referenced by a
          course is not part of the ordered semester list.
      """
      url = self.STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses'
      rsp = req.get(url, auth=self.USER)
      ord_sems = self.get_ordered_semesters()
      courses = rsp.json()['collection']
      course_list = {}
      # With nothing to filter, skip the semester lookup below so an
      # unknown semester does not raise for an empty course collection.
      if not courses:
          return course_list
      # Loop-invariant: look up the requested semester only once.
      curr_ind = ord_sems.index(semester)
      # One past the last real index; replaces the former magic value 100
      # and stays correct for any number of semesters.
      open_ended = len(ord_sems)
      for course_uri in courses:
          course = courses[course_uri]
          start_sem = course['start_semester']
          if start_sem is not None:
              start_ind = ord_sems.index(start_sem)
          else:
              start_ind = open_ended
          end_sem = course['end_semester']
          if end_sem is not None:
              end_ind = ord_sems.index(end_sem)
          else:
              end_ind = open_ended
          if start_ind <= curr_ind <= end_ind:
              course_list[course['course_id']] = course['title']
      return course_list
  def get_top_folder(self, course):
      """Return the ``id`` of a course's top folder.

      :param course: course id inserted into the
          ``/api.php/course/<id>/top_folder`` URL; must be a string.
      """
      target = self.STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder'
      payload = req.get(target, auth=self.USER).json()
      return payload['id']
  def get_docs(self, folder):
      """Return the ids of all file references directly inside a folder.

      :param folder: folder id inserted into the ``/api.php/folder/<id>``
          URL; must be a string.
      :return: list with the ``id`` of every entry in ``file_refs``.
      """
      response = req.get(self.STUDIP_DOMAIN + '/api.php/folder/' + folder,
                         auth=self.USER)
      return [file_ref['id'] for file_ref in response.json()['file_refs']]
  def download(self, doc):
      """Download one file into the current directory if it is new or changed.

      The file's metadata is fetched first. The download is skipped when
      the database already holds a download time for ``doc`` that is not
      older than the file's ``chdate``. After a download the current Unix
      time is stored as the new download time.

      :param doc: file id inserted into the ``/api.php/file/<id>`` URLs;
          must be a string.
      """
      meta_url = self.STUDIP_DOMAIN + '/api.php/file/' + doc
      # Parse the metadata response once instead of once per field.
      meta = req.get(meta_url, auth=self.USER).json()
      doc_name = meta['name']
      doc_chdate = meta['chdate']
      last_dl = self.db.get_last_file_dl(doc)
      # NOTE(review): the time is stored as a string below; this comparison
      # assumes the database hands it back in a type comparable to chdate.
      if last_dl is not None and not last_dl < doc_chdate:
          return
      print('downloading ', doc_name)
      # Same rule as for folder and course names: a '/' in the name would
      # otherwise be read as a path separator by open().
      file_name = doc_name.replace('/', '-')
      dl_url = self.STUDIP_DOMAIN + '/api.php/file/' + doc + '/download'
      # The context managers release the streamed connection and finish
      # the progress bar even if writing the file fails part-way.
      with req.get(dl_url, auth=self.USER, stream=True) as rsp:
          total_size = int(rsp.headers.get('content-length', 0))
          with tqdm(total=total_size, unit='iB', unit_scale=True) as progbar, \
                  open(file_name, 'wb') as doc_file:
              for chunk in rsp.iter_content(self.CHUNK_SIZE):
                  progbar.update(len(chunk))
                  doc_file.write(chunk)
      self.db.set_last_file_dl(str(doc), str(int(time.time())))
  def get_subdirs(self, folder):
      """Return the direct subfolders of a folder.

      Only the ``subfolders`` field of the response is read. The former
      lookup of ``file_refs`` was never used and has been removed, so a
      response without that field no longer raises ``KeyError`` here.

      :param folder: folder id inserted into the ``/api.php/folder/<id>``
          URL; must be a string.
      :return: dict mapping each subfolder's ``id`` to its ``name``.
      """
      url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder
      rsp = req.get(url, auth=self.USER)
      subdirs = rsp.json()['subfolders']
      return {subdir['id']: subdir['name'] for subdir in subdirs}
  def download_folder(self, folder):
      """Download every file found directly inside ``folder``.

      The ids come from ``get_docs``; each one is announced on stdout and
      then handed to ``download``.

      :param folder: id of the folder whose files are fetched.
      """
      for doc_id in self.get_docs(folder):
          print('found doc ', doc_id)
          self.download(doc_id)
  def download_folder_rec(self, folder, base_dir):
      """Download a folder and all of its subfolders into ``base_dir``.

      The files of ``folder`` are written to ``base_dir``. Each subfolder
      is stored in a directory named after it (with ``/`` replaced by
      ``-``) below ``base_dir``. The process working directory is changed
      along the way and is not restored afterwards.

      :param folder: id of the folder to download.
      :param base_dir: directory that receives the folder's content.
      """
      # Resolve once, before any chdir: a relative path would point
      # somewhere else as soon as the working directory changes.
      base_dir = os.path.abspath(base_dir)
      print('folder ', folder)
      self.create_dir(base_dir)
      # ``download`` writes into the working directory, so switch to the
      # target first instead of relying on the caller having done so.
      os.chdir(base_dir)
      self.download_folder(folder)
      subdirs = self.get_subdirs(folder)
      for subdir in subdirs:
          subdir_name = subdirs[subdir].replace('/', '-')
          subdir_path = os.path.join(base_dir, subdir_name)
          print(subdir_path)
          self.create_dir(subdir_path)
          # The recursive call changes into ``subdir_path`` itself.
          self.download_folder_rec(subdir, subdir_path)
  def download_course(self, course, base_dir):
      """Download all files of one course into ``base_dir``.

      The directory is created if needed and becomes the working
      directory. The course's top folder is then downloaded recursively.

      :param course: id of the course to download.
      :param base_dir: directory that receives the course's files.
      """
      # Resolve before chdir: after changing into a relative ``base_dir``
      # the same relative path would point to a nested, wrong location.
      base_dir = os.path.abspath(base_dir)
      print('course ', course)
      self.create_dir(base_dir)
      os.chdir(base_dir)
      root = self.get_top_folder(course)
      self.download_folder_rec(root, base_dir)
  def download_curr_courses(self, base_dir):
      """Download every course of the current semester below ``base_dir``.

      The current user and semester are looked up first. Each selected
      course is then stored in a directory named after its title (with
      ``/`` replaced by ``-``) inside ``base_dir``.

      :param base_dir: directory that receives one subdirectory per course.
      """
      # Resolve before chdir: course paths are joined onto ``base_dir``
      # after the working directory changed, which broke relative paths.
      base_dir = os.path.abspath(base_dir)
      print('Start downloading all current courses')
      self.create_dir(base_dir)
      curr_courses = self.get_curr_courses(
          self.get_uid(), self.get_curr_semester())
      os.chdir(base_dir)
      for course, title in curr_courses.items():
          print('course is ', title)
          course_name = title.replace('/', '-')
          path = os.path.join(base_dir, course_name)
          self.download_course(course, path)