Scrape Kita-Navigator into Excel

Berlin has this gorgeous tool for finding Kitas: the Kita Navigator. However, as far as I know, there is no way to extract the data in a format like Excel or CSV.

Since I am not the kind of person who enjoys clicking through web GUIs, I wrote a little script to extract all of the needed data. Unfortunately, the page is one of those webpages that cannot be read without JavaScript enabled, so I had to use Selenium to scrape all the information out of the pages.

#!/usr/bin/env python3.8
"""Scrape the Berlin Kita-Navigator into a CSV file via Selenium."""

import datetime

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# TODO(review): the original search URL was lost in transcription — fill in
# the Kita-Navigator results URL (https://kita-navigator.berlin.de/...)
# before running.
URL = "https://kita-navigator.berlin.de/"
# Output file is date-stamped, e.g. ~/kitas_20240131.csv
OUTFILE_PATH = f"~/kitas_{datetime.datetime.now().strftime('%Y%m%d')}.csv"

def pagination(driver):
    """Generator over result pages.

    Yields the current (1-based) page number, then advances the browser to
    the next results page by clicking the "next" button. Terminates normally
    when the next-button is absent (last page); any other missing-element
    error is re-raised.
    """
    page = 1
    while True:
        print(f"Page [{page:2}]")
        # Yield first so the caller can scrape the page we are currently on
        # before we navigate away from it.
        yield page
        try:
            nextbutton = driver.find_element_by_css_selector('[title="Link zur nächsten Seite."]')
            driver.execute_script("arguments[0].scrollIntoView();", nextbutton)
            nextbutton.click()
        except NoSuchElementException as err:
            # No next-button means we reached the last page -> stop cleanly.
            if '[title="Link zur nächsten Seite."]' in str(err):
                break
            raise
        page += 1

def extract_buttons(driver):
    """Yield the detail-page href of every Kita button on the current page.

    The result list shows at most 12 entries per page, so the loop is capped
    at 12 even if the selector matches more elements.
    """
    n_buttons = min(len(driver.find_elements_by_css_selector('[class="btn btn-primary"]')), 12)
    for i in range(n_buttons):
        await_page(driver, EC.presence_of_element_located((By.CSS_SELECTOR, '[class="btn btn-primary"]')))
        # Re-query each iteration: the DOM may have been re-rendered.
        button = driver.find_elements_by_css_selector('[class="btn btn-primary"]')[i]
        driver.execute_script("return arguments[0].scrollIntoView(true);", button)
        # Progress output: strip the boilerplate around the Kita name.
        print(
            f"\t \t -\t{button.get_attribute('title').replace('Die Detailansicht der Kita', '').replace('aufrufen', '')}")
        yield button.get_attribute('href')

def extract_elemts(driver):
    await_page(driver, EC.presence_of_element_located((By.CSS_SELECTOR, '[class="kita-name mb-0 align-self-center"]')))
    print(driver.find_element_by_css_selector('[class="kita-name mb-0 align-self-center"]').text)

    return {
        'name': driver.find_element_by_css_selector('[class="kita-name mb-0 align-self-center"]').text,

        'addresse': "" if not (
            x := driver.find_elements_by_css_selector(
                '[title="Externer Link Wegbeschreibung in Google Maps (Öffnet neuen Tab)"]')
        ) else x[0].text,

        'maps_link': "" if not (
            x := driver.find_elements_by_css_selector(
                '[title="Externer Link Wegbeschreibung in Google Maps (Öffnet neuen Tab)"]')
        ) else x[0].text,

        'telefonnummer': "" if not (
            x := driver.find_elements_by_css_selector('[title="Die Kita anrufen"]')
        ) else x[0].text,

        'email': "" if not (
            x := driver.find_elements_by_css_selector('[title="Eine E-Mail an die Kita senden"]')
        ) else x[0],

        'homepage': "" if not (
            x := driver.find_elements_by_css_selector('[title="Externer Link Homepage der Kita (Öffnet neuen Tab)"]')
        ) else x[0].text,

        'entfernung_km': float(driver.find_element_by_xpath(
            "//*[contains(text(), 'Entfernung')]/following-sibling::p").text.replace(" km", "")),

        'aufnahmealter': driver.find_element_by_xpath(
            "//*[contains(text(), 'Aufnahmealter ab')]/following-sibling::p").text,

        'platzangebot': driver.find_element_by_xpath(
            "//*[contains(text(), 'Maximales Platzangebot')]/following-sibling::p").text,

        'pädagogischer_ansatz': "" if not (
            x := driver.find_elements_by_xpath("//*[contains(text(), 'Pädagogischer Ansatz')]/following-sibling::ul")
        ) else x[0].text,

        'altersstruktur': "" if not (
            x := driver.find_elements_by_xpath(
                "//*[contains(text(), 'Altersstruktur')]/following-sibling::ul")
        ) else x[0].text,

        'thematische_schwerpunkte': "" if not (
            x := driver.find_elements_by_xpath(
                "//*[contains(text(), 'Thematische Schwerpunkte')]/following-sibling::ul")
        ) else x[0].text,

        'Öffnungszeiten': driver.find_element_by_xpath(
            "//*[contains(text(), 'Öffnungszeiten')]/following-sibling::p").text}

def await_page(driver, elem):
    """Wait for *elem* (an expected-condition) to appear on the page.

    Tries up to two 10-second waits before giving up; raises
    TimeoutException when the element never shows up.
    """
    for _ in range(2):
        try:
            WebDriverWait(driver, 10).until(elem)
        except TimeoutException:
            # First timeout: retry once before failing for good.
            continue
        return
    raise TimeoutException(f"Can't find element {elem}")

def run_extraction():
    """Drive a headless Firefox through the Kita-Navigator.

    Yields one dict per successfully scraped Kita detail page; pages missing
    a mandatory field are logged and skipped instead of aborting the run.
    """
    options = webdriver.FirefoxOptions()
    options.headless = True
    with webdriver.Firefox(options=options) as driver:
        driver.get(URL)
        await_page(driver, EC.presence_of_element_located((By.CSS_SELECTOR, '[class="d-inline-flex mr-1"]')))

        print("\nScraping links to Kita pages ...")
        # pagination() advances the browser page by page; extract_buttons()
        # is consumed once per page to collect that page's detail links.
        hrefs = [href for _ in pagination(driver) for href in extract_buttons(driver)]
        print(f"\nExtracting information for {len(hrefs)} Kitas ...")
        for i, href in enumerate(hrefs):
            try:
                driver.get(href)
                elemts = extract_elemts(driver)
                print(f"{i + 1}/{len(hrefs)}\n\t{elemts}\n{'- ' * 40}\n")
                yield elemts
            except NoSuchElementException as err:
                # Best effort: log the failing page and carry on.
                print(f"{i + 1}/{len(hrefs)}\n\t{err}\t{href}\n{'- ' * 40}\n")

def main():
    """Collect all scraped Kitas and persist them as CSV at OUTFILE_PATH."""
    kitas = list(run_extraction())
    # BUG FIX: the results were collected but never written anywhere;
    # dump them to the date-stamped CSV (pandas expands the '~' itself).
    pd.DataFrame(kitas).to_csv(OUTFILE_PATH, index=False)

if __name__ == '__main__':
    main()