Scrape Kita-Navigator into Excel

Berlin has this gorgeous tool for finding Kitas: the Kita Navigator. However, as far as I know, there is no way to extract the data in a format like Excel or CSV.

Since I am not the kind of person who enjoys clicking through web GUIs, I wrote a little script to extract all of the needed data. Unfortunately, the page is one of those webpages that cannot be read without JavaScript enabled, so I had to use Selenium to scrape all the information out of the pages.

#!/usr/bin/env python3.8
"""Scrape the Berlin Kita-Navigator into a CSV file via Selenium."""

import datetime

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# TODO(review): the original search URL was lost in transcription — fill in
# the Kita-Navigator results URL (https://kita-navigator.berlin.de/...)
# before running.
URL = "https://kita-navigator.berlin.de/"
# Output file is date-stamped, e.g. ~/kitas_20240131.csv
OUTFILE_PATH = f"~/kitas_{datetime.datetime.now().strftime('%Y%m%d')}.csv"

def pagination(driver):
    """Generator over result pages.

    Yields the current (1-based) page number, then advances the browser to
    the next results page by clicking the "next" button. Terminates normally
    when the next-button is absent (last page); any other missing-element
    error is re-raised.
    """
    page = 1
    while True:
        print(f"Page [{page:2}]")
        # Yield first so the caller can scrape the page we are currently on
        # before we navigate away from it.
        yield page
        try:
            nextbutton = driver.find_element_by_css_selector('[title="Link zur nächsten Seite."]')
            driver.execute_script("arguments[0].scrollIntoView();", nextbutton)
            nextbutton.click()
        except NoSuchElementException as err:
            # No next-button means we reached the last page -> stop cleanly.
            if '[title="Link zur nächsten Seite."]' in str(err):
                break
            raise
        page += 1

def extract_buttons(driver):
    """Yield the detail-page href of every Kita button on the current page.

    The result list shows at most 12 entries per page, so the loop is capped
    at 12 even if the selector matches more elements.
    """
    n_buttons = min(len(driver.find_elements_by_css_selector('[class="btn btn-primary"]')), 12)
    for i in range(n_buttons):
        await_page(driver, EC.presence_of_element_located((By.CSS_SELECTOR, '[class="btn btn-primary"]')))
        # Re-query each iteration: the DOM may have been re-rendered.
        button = driver.find_elements_by_css_selector('[class="btn btn-primary"]')[i]
        driver.execute_script("return arguments[0].scrollIntoView(true);", button)
        # Progress output: strip the boilerplate around the Kita name.
        print(
            f"\t \t -\t{button.get_attribute('title').replace('Die Detailansicht der Kita', '').replace('aufrufen', '')}")
        yield button.get_attribute('href')

def extract_elemts(driver):
    await_page(driver, EC.presence_of_element_located((By.CSS_SELECTOR, '[class="kita-name mb-0 align-self-center"]')))
    print(driver.find_element_by_css_selector('[class="kita-name mb-0 align-self-center"]').text)

    return {
        'name': driver.find_element_by_css_selector('[class="kita-name mb-0 align-self-center"]').text,

        'addresse': "" if not (
            x := driver.find_elements_by_css_selector(
                '[title="Externer Link Wegbeschreibung in Google Maps (Öffnet neuen Tab)"]')
        ) else x[0].text,

        'maps_link': "" if not (
            x := driver.find_elements_by_css_selector(
                '[title="Externer Link Wegbeschreibung in Google Maps (Öffnet neuen Tab)"]')
        ) else x[0].text,

        'telefonnummer': "" if not (
            x := driver.find_elements_by_css_selector('[title="Die Kita anrufen"]')
        ) else x[0].text,

        'email': "" if not (
            x := driver.find_elements_by_css_selector('[title="Eine E-Mail an die Kita senden"]')
        ) else x[0],

        'homepage': "" if not (
            x := driver.find_elements_by_css_selector('[title="Externer Link Homepage der Kita (Öffnet neuen Tab)"]')
        ) else x[0].text,

        'entfernung_km': float(driver.find_element_by_xpath(
            "//*[contains(text(), 'Entfernung')]/following-sibling::p").text.replace(" km", "")),

        'aufnahmealter': driver.find_element_by_xpath(
            "//*[contains(text(), 'Aufnahmealter ab')]/following-sibling::p").text,

        'platzangebot': driver.find_element_by_xpath(
            "//*[contains(text(), 'Maximales Platzangebot')]/following-sibling::p").text,

        'pädagogischer_ansatz': "" if not (
            x := driver.find_elements_by_xpath("//*[contains(text(), 'Pädagogischer Ansatz')]/following-sibling::ul")
        ) else x[0].text,

        'altersstruktur': "" if not (
            x := driver.find_elements_by_xpath(
                "//*[contains(text(), 'Altersstruktur')]/following-sibling::ul")
        ) else x[0].text,

        'thematische_schwerpunkte': "" if not (
            x := driver.find_elements_by_xpath(
                "//*[contains(text(), 'Thematische Schwerpunkte')]/following-sibling::ul")
        ) else x[0].text,

        'Öffnungszeiten': driver.find_element_by_xpath(
            "//*[contains(text(), 'Öffnungszeiten')]/following-sibling::p").text}

def await_page(driver, elem):
    """Wait for *elem* (an expected-condition) to appear on the page.

    Tries up to two 10-second waits before giving up; raises
    TimeoutException when the element never shows up.
    """
    for _ in range(2):
        try:
            WebDriverWait(driver, 10).until(elem)
        except TimeoutException:
            # First timeout: retry once before failing for good.
            continue
        return
    raise TimeoutException(f"Can't find element {elem}")

def run_extraction():
    """Drive a headless Firefox through the Kita-Navigator.

    Yields one dict per successfully scraped Kita detail page; pages missing
    a mandatory field are logged and skipped instead of aborting the run.
    """
    options = webdriver.FirefoxOptions()
    options.headless = True
    with webdriver.Firefox(options=options) as driver:
        driver.get(URL)
        await_page(driver, EC.presence_of_element_located((By.CSS_SELECTOR, '[class="d-inline-flex mr-1"]')))

        print("\nScraping links to Kita pages ...")
        # pagination() advances the browser page by page; extract_buttons()
        # is consumed once per page to collect that page's detail links.
        hrefs = [href for _ in pagination(driver) for href in extract_buttons(driver)]
        print(f"\nExtracting information for {len(hrefs)} Kitas ...")
        for i, href in enumerate(hrefs):
            try:
                driver.get(href)
                elemts = extract_elemts(driver)
                print(f"{i + 1}/{len(hrefs)}\n\t{elemts}\n{'- ' * 40}\n")
                yield elemts
            except NoSuchElementException as err:
                # Best effort: log the failing page and carry on.
                print(f"{i + 1}/{len(hrefs)}\n\t{err}\t{href}\n{'- ' * 40}\n")

def main():
    """Collect all scraped Kitas and persist them as CSV at OUTFILE_PATH."""
    kitas = list(run_extraction())
    # BUG FIX: the results were collected but never written anywhere;
    # dump them to the date-stamped CSV (pandas expands the '~' itself).
    pd.DataFrame(kitas).to_csv(OUTFILE_PATH, index=False)

if __name__ == '__main__':
    main()