import requests, os, duckdb, zipfile, shutil, json, re
from datetime import datetime, timedelta
from pathlib import Path
from sys import argv

data_root = "/var/www/discorddump"
db_name = "discord.duckdb"
table_name = "discord"

def download_file(url, local_filename):
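    """Stream a remote file to local_filename. Returns True on HTTP 200, False otherwise."""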
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(local_filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        return True
    else:
        print(f"Failed to download {url}")
        return False

def generate_date_range(start_date, end_date):
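    """Yield one datetime per day from start_date to end_date, inclusive."""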
    current_date = start_date
    while current_date <= end_date:
        yield current_date
        current_date += timedelta(days=1)

def download(start_date_str, end_date_str, base_url):
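    """Download the daily DSA data-dump zip for every date in the range,
    skipping dates already present in tmp/dl or sources/. Returns the list
    of newly downloaded file paths."""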
    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
    stats = [0, 0, 0]  # success, skip, fail
    sources = []
    for date in generate_date_range(start_date, end_date):
        date_str = date.strftime('%Y-%m-%d')
        # Create the URL for the current date
        url = f"{base_url}/sor-discord-netherlands-bv-{date_str}-full.zip"
        # Create the local filename
        local_filename = f"{data_root}/tmp/dl/discorddump-dsa-{date_str}.zip"
        # files get backed up to sources/
        if os.path.isfile(f"{data_root}/sources/discorddump-dsa-{date_str}.zip") or os.path.isfile(local_filename):
            stats[1] += 1
            continue
        print(f"Fetching {url}", end=('\r' if date_str != end_date_str else '\n'))
        # Download the file
        if download_file(url, local_filename):
            stats[0] += 1
            sources.append(local_filename)
        else:
            stats[2] += 1
    print(f"Done! Downloaded {stats[0]}, skipped {stats[1]}, failed {stats[2]} files.")
    return sources

def update_db():
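    """Extract the downloaded archives (outer zips and the zips nested inside them),
    create the DuckDB database if it doesn't exist yet, and bulk-insert every
    extracted CSV into the discord table."""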
    zip_dir = Path(f"{data_root}/tmp/dl")
    extracted_dir = Path(f"{data_root}/tmp/ex")
    # extract inner and outer, skipping already-extracted files
    # TODO: multithread this, it takes ages
    print("Decompressing... (this might take a few moments)")
    for outer_zip in zip_dir.glob('*.zip'):
        with zipfile.ZipFile(outer_zip, 'r') as zip_ref:
            for file in zip_ref.namelist():
                file_path = extracted_dir / file
                if not file_path.exists():
                    zip_ref.extract(file, extracted_dir)
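    # the daily archives contain further zip files; unpack those as well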
    for inner_zip in extracted_dir.glob('*.zip'):
        with zipfile.ZipFile(inner_zip, 'r') as zip_ref:
            for file in zip_ref.namelist():
                file_path = extracted_dir / file
                if not file_path.exists():
                    zip_ref.extract(file, extracted_dir)

    # we now have a pile of CSVs in extracted/, add these to the database
    if not os.path.isfile(f"{data_root}/{db_name}"):
        print(f"No database found, creating one at `{data_root}/{db_name}`")
        con = duckdb.connect(f"{data_root}/{db_name}")
        # TODO don't make booleans (which are Yes/No in the csv) a VARCHAR
        con.execute("""
            CREATE TABLE discord (
                uuid                             VARCHAR,
                decision_visibility              VARCHAR,
                decision_visibility_other        VARCHAR,
                end_date_visibility_restriction  VARCHAR,
                decision_monetary                VARCHAR,
                decision_monetary_other          VARCHAR,
                end_date_monetary_restriction    VARCHAR,
                decision_provision               VARCHAR,
                end_date_service_restriction     VARCHAR,
                decision_account                 VARCHAR,
                end_date_account_restriction     VARCHAR,
                account_type                     VARCHAR,
                decision_ground                  VARCHAR,
                decision_ground_reference_url    VARCHAR,
                illegal_content_legal_ground     VARCHAR,
                illegal_content_explanation      VARCHAR,
                incompatible_content_ground      VARCHAR,
                incompatible_content_explanation VARCHAR,
                incompatible_content_illegal     VARCHAR,
                category                         VARCHAR,
                category_addition                VARCHAR,
                category_specification           VARCHAR,
                category_specification_other     VARCHAR,
                content_type                     VARCHAR,
                content_type_other               VARCHAR,
                content_language                 VARCHAR,
                content_date                     VARCHAR,
                territorial_scope                VARCHAR,
                application_date                 VARCHAR,
                decision_facts                   VARCHAR,
                source_type                      VARCHAR,
                source_identity                  VARCHAR,
                automated_detection              VARCHAR,
                automated_decision               VARCHAR,
                platform_name                    VARCHAR,
                platform_uid                     VARCHAR,
                created_at                       VARCHAR
            );
        """)
        con.close()
    con = duckdb.connect(f"{data_root}/{db_name}")
    filelist = [f for f in extracted_dir.glob("*.csv") if f.is_file()]
    if filelist:
        print(f"Adding {len(filelist)} CSV files to the DB... (this might take a few moments)")
        # DuckDB scans every CSV matching the glob as a single table
        con.execute(f"""
            INSERT INTO {table_name}
            SELECT * FROM '{data_root}/tmp/ex/*.csv';
        """)
    con.close()

def convert_bytes(num):
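    """Format a byte count as a human-readable string (bytes up to TB)."""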
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num /= 1024.0

def cleanup(start_date, end_date):
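    """Move the downloaded zips into sources/ for safekeeping and delete the tmp/ working directory."""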
    print("Cleaning temporary files...")
    if not os.path.isdir(f"{data_root}/sources"):
        os.mkdir(f"{data_root}/sources")
    for zip_file in Path(f"{data_root}/tmp/dl").glob('*.zip'):
        shutil.move(str(zip_file), Path(f"{data_root}/sources/{zip_file.name}"))

    shutil.rmtree(f"{data_root}/tmp")
    print(f"Done! Raw files from {start_date} to {end_date} stored in {data_root}/sources; Database at {data_root}/{db_name}")

def maybe_diverge_arg():
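    """Parse command-line flags and dispatch to the matching action, exiting afterwards.
    With no arguments, fall through so the default update behavior runs."""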
    next_query = False
    for arg in argv[1:]:
        if arg in ["-h", "--help"]:
            print("discord-dsv.py - 2025-05-07\n\
Script to automatically keep a database of discord's leaked data up-to-date.\n\
-h/--help: Display this help\n\
-i/--info: Display info about the database\n\
-d/--dedup: Deduplicate database (this should not be needed but i suck at this)\n\
-q/--query: Query database for user\n\
default behavior: Update database")
            exit(0)
        elif arg in ["-i", "--info"]:
            db_info()
            exit(0)
        elif arg in ["-d", "--dedup"]:
            dedup()
            exit(0)
        elif arg in ["-q", "--query"]:
            next_query = True
        elif next_query:
            query(arg)
            exit(0)
        else:
            print("Unknown argument! Use --help to see valid arguments.")
            exit(1)
    if next_query:
        print("--query requires a discord user ID! Use --help to see valid usage.")
        exit(1)

# this one returns JSON
def query(userid):
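    """Print (as a JSON array) every row whose platform_uid contains the given Discord user ID."""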
    # can't just check if it's a number:
    # "1" would get millions of matches and break the server
    if re.fullmatch(r'\d{17,20}', userid) is None:
        print(json.dumps({"error": "User ID invalid, must consist of 17 to 20 digits"}))
        exit(1)
    con = duckdb.connect(f"{data_root}/{db_name}")
    # userid is digits-only (validated above); a bound parameter keeps the query safe regardless
    rows = con.execute(f"""
        SELECT to_json(t) AS row_json
        FROM (
            SELECT * FROM {table_name} WHERE platform_uid LIKE ?
        ) AS t;
    """, [f"%{userid}%"]).fetchall()
    con.close()
    json_array = [json.loads(row[0]) for row in rows]
    print(json.dumps(json_array))

def db_info():
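    """Print the database file size, total entry count, and number of distinct users."""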
    if os.path.isfile(f"{data_root}/{db_name}"):
        size = os.stat(f"{data_root}/{db_name}").st_size
        print(f"Database size : {convert_bytes(size)}")
        con = duckdb.connect(f"{data_root}/{db_name}")
        row_count = con.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
        print(f"Entries       : {row_count}")
        user_count = con.execute(f"SELECT COUNT(DISTINCT split_part(platform_uid, '-', 2)) FROM {table_name}").fetchone()[0]
        print(f"Distinct users: {user_count}")
        con.close()
    else:
        print("No database present!")

def dedup():
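    """Rebuild the table with only distinct rows and report how many duplicates were removed."""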
    print("Deduplicating...")
    con = duckdb.connect(f"{data_root}/{db_name}")
    # create copy with only unique rows
    old_size = con.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
    con.execute(f"""
        CREATE TABLE temp_table AS 
        SELECT DISTINCT * FROM {table_name};
    """)
    # replace table with deduplicated table
    con.execute(f"DROP TABLE {table_name};")
    con.execute(f"ALTER TABLE temp_table RENAME TO {table_name};")
    new_size = con.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
    con.commit()
    con.close()
    print(f"Old entry count: {old_size}\nNew entry count: {new_size} (removed {old_size-new_size} duplicate entries)")

if __name__ == "__main__":
    maybe_diverge_arg()
    # set up the temporary working directories; an existing tmp/ means a previous run was interrupted
    if os.path.isdir(f"{data_root}/tmp"):
        print("Detected previous interrupted run, attempting to resume.")
    os.makedirs(f"{data_root}/tmp/dl", exist_ok=True)
    os.makedirs(f"{data_root}/tmp/ex", exist_ok=True)
    # don't change! oldest file available from DSA data dumps
    start_date = "2024-07-09"
    # data is uploaded with a one-day delay. may break due to timezone fuckery, I refuse to deal with that.
    end_date = (datetime.today()-timedelta(1)).strftime('%Y-%m-%d')
    print(f"Fetching data from {start_date} to {end_date}. Ensure you are connected to a VPN (or the McDonalds WiFi, thats fine too :3)")
    res = input("Continue? [y/N]: ")
    base_url = "https://dsa-sor-data-dumps.s3.eu-central-1.amazonaws.com"
    if res in ["y", "Y"]:
        download(start_date, end_date, base_url)
        update_db()
        cleanup(start_date, end_date)
    else:
        print("Quitting...")
