import os.path
import re
import subprocess
import time
from datetime import datetime, timedelta

# Matches a pasted cURL command containing the IHK "berichtswoche" API URL.
# Group 2 is the URL up to the `datum=` query parameter, group 3 the ISO date
# (yyyy-mm-dd) that gets swapped out for each requested week.
WEEK_URL_REGEX = r"(.*)(https:\/\/bildung\.service\.ihk\.de\/berichtsheft\/erstellen-api\/v1\/berichtswoche\?datum=)([0-9]*-[0-9]{2}-[0-9]{2})(.*)"

DAYS_IN_WEEK = 7


def _read_date_range():
    """Prompt the user for the date range to download.

    Returns:
        (start_date, end_date) as naive datetimes; an empty end-date input
        defaults to the current moment.
    """
    start_date = datetime.strptime(
        input("enter start date to download (yyyy-mm-dd): "), "%Y-%m-%d")
    raw_end = input("enter end date to download (yyyy-mm-dd): ")
    if raw_end == "":
        end_date = datetime.now()
    else:
        end_date = datetime.strptime(raw_end, "%Y-%m-%d")
    return start_date, end_date


def _fetch_week(base_request, week_start):
    """Replay the pasted cURL command for one report week.

    Skips the week entirely if its output file already exists on disk.

    Args:
        base_request: the user's full cURL command; the URL's date part is
            substituted with *week_start* before execution.
        week_start: date of the week's Monday; also names the output file.
    """
    file_name = f"week_from_{week_start}.json"
    if os.path.isfile(file_name):
        print(f"skipping already existing file {file_name}")
        return
    # Rewrite the URL's date query parameter to this week's Monday.
    subst = f"\\g<1>\\g<2>{week_start}\\g<4>"
    request = re.sub(WEEK_URL_REGEX, subst, base_request,
                     count=1, flags=re.DOTALL | re.MULTILINE)
    # Add an output file so curl writes the response to disk instead of stdout.
    request = request.replace('curl ', f'curl -o {file_name} ')
    print(f'Fetching week {week_start}')
    # NOTE(review): shell=True executes the user's own pasted command verbatim.
    # Acceptable for a personal tool where the user supplies the command, but
    # never feed this path untrusted input.
    subprocess.Popen(request, shell=True).wait()
    time.sleep(1)  # not sure if there is ddos protection / spam2ban


def download_entries_v1():
    """Interactively download weekly IHK report-book ("Berichtsheft") entries.

    The user pastes a browser-captured cURL request for the `berichtswoche`
    endpoint; it is replayed once per calendar week between the entered start
    and end dates, saving each response as week_from_<monday>.json in the
    current working directory.
    """
    base_request = input(
        "Search for \"berichtswoche?\" and select the result initiated by 'polyfills*'. Copy as cURL and paste here\nDepending on your browser / terminal running this in, you might need to replace any \\, \\n (or \\\\\\n) with '' before pasting, as \\n can trigger the end of the input reading: ")
    start_date, end_date = _read_date_range()

    # BUG FIX: the original tested `if re.sub(...)` to detect a missing URL,
    # but re.sub returns its input unchanged when nothing matches, so the
    # error branch could never fire for a non-empty paste. Validate up front
    # and bail out instead of falling through to the success summary.
    if not re.search(WEEK_URL_REGEX, base_request, flags=re.DOTALL | re.MULTILINE):
        print("oops, failed to find date part in cURL")
        return

    # Walk Monday to Monday and fetch every week that starts before the end
    # date. (The original recomputed `start_next_week` both before and inside
    # the loop, which made the first iteration's condition test the *next*
    # Monday and silently skip the start week whenever the end date fell
    # inside it — all weeks now use the same rule.)
    week_start = (start_date - timedelta(days=start_date.weekday())).date()
    end = end_date.date()
    while week_start < end:
        _fetch_week(base_request, week_start)
        week_start += timedelta(days=DAYS_IN_WEEK)

    print(
        f"\nFetched all weeks from {start_date.date()} until {end_date.date()}\nNow backup the files located at {os.getcwd()} and wait until our own report book platform is finished")


if __name__ == '__main__':
    download_entries_v1()