Source code for climind.fetchers.fetcher_promice

#  Climate indicator manager - a package for managing and building climate indicator dashboards.
#  Copyright (c) 2022 John Kennedy
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.

from pathlib import Path
import requests
import shutil
from bs4 import BeautifulSoup, SoupStrainer


def fetch(url: str, outdir: Path, _):
    """
    Fetch Greenland mass balance data. The function scrapes a webpage in order to find the
    specific URLs of the latest version of the dataset (these change daily). These files are
    then downloaded. There should be two files: a daily file and an annual file.

    Parameters
    ----------
    url: str
        URL of the directory which contains the files to be downloaded
    outdir: Path
        Path of the directory to which the output will be written

    Returns
    -------
    None
    """
    # First open up the landing page
    landing_page_url = "https://dataverse.geus.dk/api/datasets/:persistentId/" \
                       "dirindex?persistentId=doi:10.22008/FK2/OHI23Z"
    landing_page_request = requests.get(landing_page_url, stream=True,
                                        headers={'User-agent': 'Mozilla/5.0'})

    # Next scan through the landing page to find the links to the required files. The URLs for
    # these change, but the contents of the <a> tags remain the same.
    for link in BeautifulSoup(landing_page_request.text, "html.parser", parse_only=SoupStrainer('a')):
        if ('MB_SMB_D_BMB.csv' in link.contents or
                'MB_SMB_D_BMB_ann.csv' in link.contents):
            # File number is the number after the last slash in the API call in the <a> tag
            file_number = link['href'].split('/')[-1]
            filename = link.contents[0]

            file_url = f"{url}/{file_number}"
            out_path = outdir / filename

            # Set up a request for the latest version of the target file
            file_request = requests.get(file_url, stream=True,
                                        headers={'User-agent': 'Mozilla/5.0'})
            if file_request.status_code == 200:
                with open(out_path, 'wb') as f:
                    file_request.raw.decode_content = True
                    shutil.copyfileobj(file_request.raw, f)
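
A minimal usage sketch follows, showing how this fetcher might be invoked directly. The base URL here assumes the standard Dataverse file-access endpoint pattern (`api/access/datafile/<id>`, onto which the function appends the scraped file number), and the output directory is an illustrative placeholder; in the package itself these values would come from the dataset's metadata configuration, not hard-coded strings.

from pathlib import Path

from climind.fetchers.fetcher_promice import fetch

# Hypothetical output directory; any writable path will do.
outdir = Path("data/promice")
outdir.mkdir(parents=True, exist_ok=True)

# Assumed base URL for the Dataverse file-access API; fetch() appends
# the file number found on the landing page to this. The third argument
# is unused by this fetcher (hence the underscore in its signature), so
# None is passed as a placeholder.
fetch("https://dataverse.geus.dk/api/access/datafile", outdir, None)

After a successful run, outdir should contain the two scraped files, MB_SMB_D_BMB.csv and MB_SMB_D_BMB_ann.csv.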