Scrape Kita-Navigator into Excel

Berlin has this gorgeous tool for finding Kitas: the Kita Navigator. However, as far as I know, there is no way to export the data in a format like Excel or CSV.

Since clicking through web GUIs is not my idea of fun, I wrote a little script to extract all of the needed data. Unfortunately, the page is one of those sites you can't read without JavaScript enabled, so I had to use Selenium to scrape the information out of the pages.
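To see why a plain HTTP request is not enough here, a quick sanity check (just a sketch, assuming the requests package is installed) is to fetch the page without a browser and look for the result cards, which only exist after client-side rendering:

import requests

html = requests.get("https://kita-navigator.berlin.de/einrichtungen").text
# The result cards are injected by JavaScript, so this should print False:
print('btn btn-primary' in html)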

#!/usr/bin/env python3.8

import datetime

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

URL = "https://kita-navigator.berlin.de/einrichtungen?"\
      "input=Ritterstra%C3%9Fe%2C%20Berlin%2C%20Deutschland&"\
      "betb=3-2020&"\
      "einfacheSuche=true&"\
      "entfernung=10&"\
      "lat=52.50256599999999&"\
      "lon=13.4064448"\
OUTFILE_PATH = f"~/kitas_{datetime.datetime.now().strftime('%Y%m%d')}.csv"


def pagination(driver):
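    # Yield once per result page, then click the "next page" button;
    # stop as soon as that button no longer exists.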
    j = 1
    while True:
        print(f"Page [{j:2}]")
        yield
        try:
            nextbutton = driver.find_element_by_css_selector('[title="Link zur nächsten Seite."]')
            driver.execute_script("arguments[0].scrollIntoView();", nextbutton)
            nextbutton.click()
        except NoSuchElementException as err:
            if '[title="Link zur nächsten Seite."]' in str(err):
                break
            raise err
        j += 1


def extract_buttons(driver):
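    # Yield the detail-page links of all result cards on the current page.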
    # A result page lists at most 12 Kita cards.
    len_buttons = min(len(driver.find_elements_by_css_selector('[class="btn btn-primary"]')), 12)
    for i in range(len_buttons):
        await_page(driver, EC.presence_of_element_located((By.CSS_SELECTOR, '[class="btn btn-primary"]')))
        button = driver.find_elements_by_css_selector('[class="btn btn-primary"]')[i]
        driver.execute_script(
            "return arguments[0].scrollIntoView(true);",
            button.find_element_by_xpath('../../../../..')
        )
        print(
            f"\t \t -\t{button.get_attribute('title').replace('Die Detailansicht der Kita', '').replace('aufrufen', '')}")
        yield button.get_attribute('href')


def extract_elements(driver):
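    # Scrape one Kita detail page into a flat dict; optional fields fall back to "".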
    await_page(driver, EC.presence_of_element_located((By.CSS_SELECTOR, '[class="kita-name mb-0 align-self-center"]')))
    print(driver.find_element_by_css_selector('[class="kita-name mb-0 align-self-center"]').text)

    return {
        'name': driver.find_element_by_css_selector('[class="kita-name mb-0 align-self-center"]').text,

        'addresse': "" if not (
            x := driver.find_elements_by_css_selector(
                '[title="Externer Link Wegbeschreibung in Google Maps (Öffnet neuen Tab)"]')
        ) else x[0].text,

        'maps_link': "" if not (
            x := driver.find_elements_by_css_selector(
                '[title="Externer Link Wegbeschreibung in Google Maps (Öffnet neuen Tab)"]')
        ) else x[0].text,

        'telefonnummer': "" if not (
            x := driver.find_elements_by_css_selector('[title="Die Kita anrufen"]')
        ) else x[0].text,

        'email': "" if not (
            x := driver.find_elements_by_css_selector('[title="Eine E-Mail an die Kita senden"]')
        ) else x[0].text,

        'homepage': "" if not (
            x := driver.find_elements_by_css_selector('[title="Externer Link Homepage der Kita (Öffnet neuen Tab)"]')
        ) else x[0].text,

        'entfernung_km': float(driver.find_element_by_xpath(
            "//*[contains(text(), 'Entfernung')]/following-sibling::p").text.replace(" km", "")),

        'aufnahmealter': driver.find_element_by_xpath(
            "//*[contains(text(), 'Aufnahmealter ab')]/following-sibling::p").text,

        'platzangebot': driver.find_element_by_xpath(
            "//*[contains(text(), 'Maximales Platzangebot')]/following-sibling::p").text,

        'pädagogischer_ansatz': "" if not (
            x := driver.find_elements_by_xpath("//*[contains(text(), 'Pädagogischer Ansatz')]/following-sibling::ul")
        ) else x[0].text,

        'altersstruktur': "" if not (
            x := driver.find_elements_by_xpath(
                "//*[contains(text(), 'Altersstruktur')]/following-sibling::ul")
        ) else x[0].text,

        'thematische_schwerpunkte': "" if not (
            x := driver.find_elements_by_xpath(
                "//*[contains(text(), 'Thematische Schwerpunkte')]/following-sibling::ul")
        ) else x[0].text,

        'Öffnungszeiten': driver.find_element_by_xpath(
            "//*[contains(text(), 'Öffnungszeiten')]/following-sibling::p").text}


def await_page(driver, elem):
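    # Wait up to 10 s for the element; refresh the page and retry once on timeout.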
    for _ in range(2):
        try:
            WebDriverWait(driver, 10).until(elem)
            return
        except TimeoutException:
            driver.refresh()
            continue
    raise TimeoutException(f"Can't find element {elem}")


def run_extraction():
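    # Drive a headless Firefox: first collect all Kita detail links,
    # then visit each one and yield the extracted fields.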
    options = webdriver.FirefoxOptions()
    options.headless = True
   
    with webdriver.Firefox(options=options) as driver:
        driver.get(URL)
        await_page(driver, EC.presence_of_element_located((By.CSS_SELECTOR, '[class="d-inline-flex mr-1"]')))

        print("\nScraping links to Kita pages ...")
        hrefs = [href for _ in pagination(driver) for href in extract_buttons(driver)]
        
        print(f"\nExtracting informations  {len(hrefs)} Kitas ...")
        for i, href in enumerate(hrefs):
            driver.get(href)
            try:
                elements = extract_elements(driver)
                print(f"{i + 1}/{len(hrefs)}\n\t{elements}\n{'- ' * 40}\n")
                yield elements
            except NoSuchElementException as err:
                print(f"{i + 1}/{len(hrefs)}\n\t{err}\t{href}\n{'- ' * 40}\n")
                continue


def main():
    kitas = list(run_extraction())
    pd.DataFrame(kitas).to_csv(OUTFILE_PATH, index=False)


if __name__ == '__main__':
    main()
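
By the way, the script writes a CSV even though the title says Excel. pandas can also produce a .xlsx file directly; a minimal sketch for converting the output afterwards, assuming openpyxl is installed and using a placeholder file name:

import pandas as pd

# Placeholder name; use the date-stamped CSV the script actually wrote.
df = pd.read_csv("kitas_20210101.csv")
df.to_excel("kitas_berlin.xlsx", index=False)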