# YARB/main.py

import re
import os.path
import subprocess
import time
from datetime import datetime, timedelta


def _iter_week_starts(start, end):
    """Yield the Monday of every calendar week that overlaps ``[start, end]``.

    Both bounds are ``datetime.date`` values and are inclusive, so the weeks
    containing ``start`` and ``end`` are both produced. Nothing is yielded
    when ``end`` lies before ``start``.
    """
    if end < start:
        return

    week_start = start - timedelta(days=start.weekday())
    while week_start <= end:
        yield week_start
        week_start += timedelta(days=7)


def download_entries_v1():
    """Download one JSON file per report week by replaying a pasted cURL command.

    Prompts on stdin for a cURL command that contains the ``berichtswoche?datum=``
    URL, a start date and an optional end date (an empty answer means today).
    For every week overlapping that range the date in the URL is replaced by
    the week's Monday and the command is run through the shell with
    ``-o week_from_<monday>.json`` added, writing into the current working
    directory. Weeks whose output file already exists are skipped.

    Raises:
        ValueError: if a non-empty date answer is not in ``yyyy-mm-dd`` format.
    """
    base_request = input(
        "Search for \"berichtswoche?\" and select the result initiated by 'polyfills*'. Copy as cURL and paste here\nDepending on your browser / terminal running this in, you might need to replace any \\, \\n (or \\\\\\n) with '' before pasting, as \\n can trigger the end of the input reading: ")

    start_date = datetime.strptime(input("enter start date to download (yyyy-mm-dd): "), "%Y-%m-%d")
    end_input = input("enter end date to download (yyyy-mm-dd): ")
    end_date = datetime.strptime(end_input, "%Y-%m-%d") if end_input else datetime.now()

    url_pattern = re.compile(
        r"(.*)(https:\/\/bildung\.service\.ihk\.de\/berichtsheft\/erstellen-api\/v1\/berichtswoche\?datum=)([0-9]*-[0-9]{2}-[0-9]{2})(.*)",
        flags=re.DOTALL)

    # re.sub returns its input unchanged when nothing matches, so a missing
    # URL has to be detected explicitly; otherwise the same unmodified request
    # would be replayed for every week.
    if url_pattern.search(base_request) is None:
        print("oops, failed to find date part in cURL")
        return

    for week_start in _iter_week_starts(start_date.date(), end_date.date()):
        file_name = f"week_from_{week_start}.json"
        if os.path.isfile(file_name):
            print(f"skipping already existing file {file_name}")
            continue

        command = url_pattern.sub(rf"\g<1>\g<2>{week_start}\g<4>", base_request, count=1)
        # Only the first "curl " gets the output flag; later occurrences
        # (e.g. inside a header value) must stay untouched.
        command = command.replace('curl ', f'curl -o {file_name} ', 1)

        print(f'Fetching week {week_start}')
        # NOTE: the pasted text is handed to the shell verbatim, so only paste
        # commands copied from your own browser session.
        completed = subprocess.run(command, shell=True)
        if completed.returncode != 0:
            print(f"curl exited with code {completed.returncode} for week {week_start}")
        time.sleep(1)  # not sure if there is ddos protection / spam2ban

    print(
        f"\nFetched all weeks from {start_date.date()} until {end_date.date()}\nNow backup the files located at {os.getcwd()} and wait until our own report book platform is finished")


# Run the interactive download only when this file is executed as a script.
if __name__ == "__main__":
    download_entries_v1()