"""Thames Water incident scraper. Scrapes https://www.thameswater.co.uk/network-latest for current incidents. The page is Next.js server-rendered with incident data embedded as JSON. Designed to run on a cron schedule (e.g. every 6 hours) to build up a historical incident database for InSAR validation. """ import csv import json import re from datetime import datetime from pathlib import Path import requests NETWORK_LATEST_URL = "https://www.thameswater.co.uk/network-latest" DB_PATH = Path("data/incidents/thames_water_scraped.csv") # Fields we want to capture from each incident CSV_FIELDS = [ "scraped_at", "incident_id", "title", "description", "status", "category", "location", "postcode", "latitude", "longitude", "start_date", "estimated_end_date", "raw_json", ] def fetch_incidents() -> list[dict]: """Fetch current incidents from Thames Water network-latest page. The page embeds incident data as JSON within the Next.js payload. Returns a list of incident dicts (may be empty if no active incidents). """ headers = { "User-Agent": "Exostrata-InSAR-Research/1.0 (incident-monitoring)", } resp = requests.get(NETWORK_LATEST_URL, headers=headers, timeout=30) resp.raise_for_status() html = resp.text incidents = [] # Strategy 1: Look for JSON incidents array in Next.js data payload # Pattern: "incidents":[{...},{...}] match = re.search(r'"incidents"\s*:\s*(\[.*?\])\s*[,}]', html) if match: try: incidents = json.loads(match.group(1)) except json.JSONDecodeError: pass # Strategy 2: Look for __NEXT_DATA__ script tag (common Next.js pattern) if not incidents: match = re.search( r']*id="__NEXT_DATA__"[^>]*>(.*?)', html, re.DOTALL, ) if match: try: next_data = json.loads(match.group(1)) # Navigate the Next.js data structure to find incidents props = next_data.get("props", {}).get("pageProps", {}) incidents = props.get("incidents", []) except (json.JSONDecodeError, AttributeError): pass return incidents def extract_fields(incident: dict, scraped_at: str) -> dict: """Extract standardised fields from a raw incident dict. The exact field names in Thames Water's data may vary — this maps common patterns to our CSV schema. 
""" # Try various field name patterns Thames Water might use row = { "scraped_at": scraped_at, "incident_id": ( incident.get("id") or incident.get("incidentId") or incident.get("reference", "") ), "title": incident.get("title", incident.get("name", "")), "description": incident.get("description", incident.get("summary", "")), "status": incident.get("status", incident.get("state", "")), "category": incident.get("category", incident.get("type", "")), "location": incident.get("location", incident.get("area", "")), "postcode": incident.get("postcode", incident.get("postalCode", "")), "latitude": incident.get("latitude", incident.get("lat", "")), "longitude": incident.get("longitude", incident.get("lng", "")), "start_date": incident.get("startDate", incident.get("createdAt", "")), "estimated_end_date": incident.get("estimatedEndDate", incident.get("eta", "")), "raw_json": json.dumps(incident, default=str), } return row def append_to_db(incidents: list[dict], db_path: Path = DB_PATH): """Append new incidents to the CSV database, skipping duplicates.""" db_path.parent.mkdir(parents=True, exist_ok=True) scraped_at = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") # Load existing incident IDs to avoid duplicates existing_ids = set() if db_path.exists(): with open(db_path, "r", encoding="utf-8") as f: reader = csv.DictReader(f) for row in reader: key = f"{row.get('incident_id', '')}_{row.get('scraped_at', '')[:10]}" existing_ids.add(key) # Write header if file doesn't exist write_header = not db_path.exists() new_count = 0 with open(db_path, "a", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=CSV_FIELDS, extrasaction="ignore") if write_header: writer.writeheader() for incident in incidents: row = extract_fields(incident, scraped_at) key = f"{row['incident_id']}_{scraped_at[:10]}" if key not in existing_ids: writer.writerow(row) new_count += 1 return new_count def scrape_and_save() -> tuple[int, int]: """Run a full scrape cycle. Returns (total_found, new_saved).""" incidents = fetch_incidents() new_saved = 0 if incidents: new_saved = append_to_db(incidents) return len(incidents), new_saved if __name__ == "__main__": print(f"Scraping Thames Water incidents from {NETWORK_LATEST_URL}...") total, new = scrape_and_save() if total == 0: print("No active incidents currently listed.") else: print(f"Found {total} incidents, saved {new} new records to {DB_PATH}")