Source code for climind.fetchers.fetcher_promice

#  Climate indicator manager - a package for managing and building climate indicator dashboards.
#  Copyright (c) 2022 John Kennedy
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.

from pathlib import Path
import requests
import shutil
from bs4 import BeautifulSoup, SoupStrainer


def fetch(url: str, outdir: Path, _):
    """
    Fetch Greenland mass balance data. The function scrapes a webpage in order to find the
    specific URLs of the latest version of the dataset (these change daily). These files are
    then downloaded. There should be two files: a daily file and an annual file.

    Parameters
    ----------
    url: str
        URL of the directory which contains the files to be downloaded
    outdir: Path
        Path of the directory to which the output will be written

    Returns
    -------
    None
    """
    # First open up the landing page
    landing_page_url = "https://dataverse.geus.dk/api/datasets/:persistentId/" \
                       "dirindex?persistentId=doi:10.22008/FK2/OHI23Z"
    landing_page_request = requests.get(landing_page_url, stream=True,
                                        headers={'User-agent': 'Mozilla/5.0'})

    # Next scan through the landing page to find the links to the required files. The URLs for
    # these change, but the contents of the <a> tags remain the same.
    for link in BeautifulSoup(landing_page_request.text, "html.parser", parse_only=SoupStrainer('a')):
        if ('MB_SMB_D_BMB.csv' in link.contents or
                'MB_SMB_D_BMB_ann.csv' in link.contents):
            # File number is the number after the last slash in the API call in the <a> tag
            file_number = link['href'].split('/')[-1]
            filename = link.contents[0]

            file_url = f"{url}/{file_number}"
            out_path = outdir / filename

            # Set up a request for the latest version of the target file
            file_request = requests.get(file_url, stream=True,
                                        headers={'User-agent': 'Mozilla/5.0'})
            if file_request.status_code == 200:
                with open(out_path, 'wb') as f:
                    file_request.raw.decode_content = True
                    shutil.copyfileobj(file_request.raw, f)
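
A minimal usage sketch follows, showing how this fetcher might be invoked directly. The base URL here assumes the standard Dataverse file-access endpoint pattern (`api/access/datafile/<id>`, onto which the function appends the scraped file number), and the output directory is an illustrative placeholder; in the package itself these values would come from the dataset's metadata configuration, not hard-coded strings.

from pathlib import Path

from climind.fetchers.fetcher_promice import fetch

# Hypothetical output directory; any writable path will do.
outdir = Path("data/promice")
outdir.mkdir(parents=True, exist_ok=True)

# Assumed base URL for the Dataverse file-access API; fetch() appends
# the file number found on the landing page to this. The third argument
# is unused by this fetcher (hence the underscore in its signature), so
# None is passed as a placeholder.
fetch("https://dataverse.geus.dk/api/access/datafile", outdir, None)

After a successful run, outdir should contain the two scraped files, MB_SMB_D_BMB.csv and MB_SMB_D_BMB_ann.csv.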