"""
My solutions to Lab02, AKBC 2022 course at UdS.
Author: Tuan-Phong Nguyen
"""

import csv
import json
import re
from pprint import pprint
from typing import Dict, List, Union

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def problem_1(name: str) -> List[Dict[str, Union[str, List[str]]]]:
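    """
    Scrape the character infobox from the How I Met Your Mother fandom wiki
    page for `name` and return a list of {"attribute": ..., "value": ...}
    dicts, where "value" is a string or, for multi-valued cells, a list of
    strings.
    """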
    url = f"https://how-i-met-your-mother.fandom.com/wiki/{name}"
    page = requests.get(url)

    # Replace <br> tags with newlines: otherwise .text joins multi-line cell
    # values without any separator, which makes them impossible to split later.
    regex = re.compile(r"<br ?/?>", re.IGNORECASE)
    html = page.content.decode()
    newtext = re.sub(regex, "\n", html)

    soup = BeautifulSoup(newtext, "html.parser")

    infobox = soup.select_one('table[class="infobox character"]')

    tds = infobox.select("td")
    results = []
    for td in tds:
        divs = td.select("div")
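        # An attribute/value cell contains exactly two <div>s (label and
        # value); skip anything else.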
        if len(divs) != 2:
            continue
        attr = divs[0].text.strip().rstrip(":")

        # Remove references
        for sup in divs[1].select("sup"):
            sup.decompose()

        # Try to split multi-valued cells. Infoboxes present multiple values
        # in many different ways and there is no exhaustive list of formats,
        # so only one special rule is applied: "Also Known As" is split on
        # commas; every other attribute is split on newlines.
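        # For example, "A, B" under "Also Known As" yields ["A", "B"], and a
        # two-line cell under any other attribute yields one entry per line.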
        value = divs[1].text.strip()
        if attr == "Also Known As":
            vals = value.split(",")
        else:
            vals = value.split("\n")

        if len(vals) > 1:
            value = [re.sub(r"\s+", " ", val).strip()
                     for val in vals if val.strip()]

        results.append({
            "attribute": attr,
            "value": value
        })

    return results


def get_courses(url):
    """Given URL of a category, return a list of its courses."""
    results = []
    page = requests.get(url + "&language=en")  # switch to English version
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.select_one(
        'table[summary="Übersicht über alle Veranstaltungen"]')
    rows = table.select("tr")
    for row in rows[1:]:
        a = row.select_one("td > a")
        course_name = a.text.strip()
        course_url = a["href"]
        results.append({
            "Name of Course": course_name,
            "URL": course_url,
        })
    return results


def problem_2_1() -> List[Dict[str, str]]:
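    """
    Collect the courses listed under a fixed set of LSF category pages and
    return them as dicts with the keys "Name of Course" and "URL".
    """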
    cat_urls = [
        "https://www.lsf.uni-saarland.de/qisserver/rds?state=wtree&search=1&trex=step&root120221=320944|310559|318658|311255|317545&P.vx=kurz",
        "https://www.lsf.uni-saarland.de/qisserver/rds?state=wtree&search=1&trex=step&root120221=320944|310559|318658|311255|318288&P.vx=kurz",
        "https://www.lsf.uni-saarland.de/qisserver/rds?state=wtree&search=1&trex=step&root120221=320944|310559|318658|311255|314705&P.vx=kurz",
        "https://www.lsf.uni-saarland.de/qisserver/rds?state=wtree&search=1&trex=step&root120221=320944|310559|318658|311255|316485&P.vx=kurz",
        "https://www.lsf.uni-saarland.de/qisserver/rds?state=wtree&search=1&trex=step&root120221=320944|310559|318658|311255|318717&P.vx=kurz",
    ]

    results = []
    for url in cat_urls:
        results.extend(get_courses(url))
    return results


def problem_2_2(url: str) -> Dict[str, Union[str, List[str]]]:
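    """
    Scrape the LSF detail page of a single course and return its basic data
    ("Grunddaten zur Veranstaltung") as an attribute -> value dict, together
    with the responsible instructors under the key "Responsible Instructors".
    "Additional Links" and "Responsible Instructors" map to lists; every other
    attribute maps to a single string.
    """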
    page = requests.get(url + "&language=en")
    soup = BeautifulSoup(page.content, "html.parser")

    # Collect Basic Information
    table = soup.select_one('table[summary="Grunddaten zur Veranstaltung"]')

    props = {}

    ths = table.select("th")
    tds = table.select("td")

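    # Map each header cell's id to its attribute name so that data cells can
    # be matched to attributes via their "headers" attribute; ids that occur
    # more than once are set aside and resolved positionally further below.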
    id2attr = {}
    duplicate_idx = set()
    for th in ths:
        attr = th.text.strip()
        if not attr:
            continue
        idx = th["id"]

        if idx in id2attr:
            duplicate_idx.add(idx)
            continue

        id2attr[idx] = attr

    # Drop attributes with duplicated header ids; they are resolved separately below
    for idx in duplicate_idx:
        del id2attr[idx]

    for td in tds:
        idx = td["headers"][0]
        if idx not in id2attr:
            continue

        attr = id2attr[idx]
        value = td.text.replace(u'\xa0', ' ').strip()
        if attr == "application period":
            value = re.sub(r"\s+", " ", value)

        if attr not in props:
            props[attr] = []
        props[attr].append(value)

    for attr, value in props.items():
        if attr != "Additional Links":
            assert len(value) == 1
            props[attr] = value[0]

    # Resolve the duplicated header id by pairing its <th> and <td> cells positionally
    assert len(duplicate_idx) == 1  # should be only one duplicate (id=11)
    dup_id = list(duplicate_idx)[0]
    dup_ths = table.select(f'th[id="{dup_id}"]')
    dup_tds = table.select(f'td[headers="{dup_id}"]')
    assert len(dup_ths) == len(dup_tds)
    for th, td in zip(dup_ths, dup_tds):
        assert th["id"] == td["headers"][0]
        attr = th.text.strip()
        value = td.text.replace(u'\xa0', ' ').strip()
        assert attr not in props
        props[attr] = value

    # Collect Responsible Instructors
    instructors = []
    table_instructors = soup.select_one(
        'table[summary="Verantwortliche Dozenten"]')
    if table_instructors:
        for a in table_instructors.select("a"):
            instructors.append(a.text.replace(u'\xa0', ' ').strip())

    props["Responsible Instructors"] = instructors

    return props


def problem_2_3() -> None:
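    """
    Fetch the detail information of every course returned by problem_2_1 and
    write the result to "courses.csv". List-valued fields ("Additional Links",
    "Responsible Instructors") are serialized as JSON strings.
    """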
    # Collect all courses (81 at the time of writing)
    courses = []
    for course in tqdm(problem_2_1()):
        info = problem_2_2(course["URL"])
        info.update({
            "Name of Course": course["Name of Course"],
        })
        courses.append(info)

    # Gather all distinct fields across the courses (16 at the time of writing)
    fields = set()
    for course in courses:
        fields.update(course.keys())

    with open("courses.csv", "w+") as f:
        writer = csv.DictWriter(f, fieldnames=list(fields))
        writer.writeheader()
        for course in courses:
            course = course.copy()
            # Fill in missing fields with empty strings/lists
            for k in fields:
                if k not in course:
                    if k not in {"Additional Links", "Responsible Instructors"}:
                        course[k] = ""
                    else:
                        course[k] = []
            # Convert lists to JSON strings
            course["Additional Links"] = json.dumps(course["Additional Links"])
            course["Responsible Instructors"] = json.dumps(
                course["Responsible Instructors"])
            # Write row
            writer.writerow(course)


def main():
    # You can call your functions here to test their behaviours.
    # pprint(problem_1("Lily Aldrin"))
    problem_2_3()


if __name__ == "__main__":
    main()
