Source code for climind.fetchers.fetcher_noaaglobaltemp

#  Climate indicator manager - a package for managing and building climate indicator dashboards.
#  Copyright (c) 2022 John Kennedy
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.

import re
from pathlib import Path
import requests
import shutil
from bs4 import BeautifulSoup, SoupStrainer

from climind.fetchers.fetcher_utils import dir_and_filename_from_url, url_from_filename


def fetch(url: str, outdir: Path, _) -> None:
    """
    Fetch NOAAGlobalTemp data.

    The script scrapes the directory specified in the URL for a file that
    matches the pattern specified in the URL. If several links match, the
    last one found in the directory listing is the one downloaded.

    Parameters
    ----------
    url: str
        URL of the file to be downloaded, containing wildcards for information
        that needs to be matched on a case by case basis.
    outdir: Path
        Path of the directory to which the output will be written
    _:
        Unused; accepted so the call signature stays compatible with callers
        that pass a third argument.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If no link in the directory listing matches the file name pattern.

    Notes
    -----
    If the download request for the matched file does not return HTTP status
    200, nothing is written and no exception is raised.
    """
    dirname, filename = dir_and_filename_from_url(url)

    # get contents of the directory
    listing = requests.get(dirname)

    # compile filename with wildcards into regular expression
    pattern = re.compile(filename)

    # get all <a> tags from the directory and find the one that matches our
    # reg ex; later matches deliberately overwrite earlier ones
    matched_file = None
    for link in BeautifulSoup(listing.text, "html.parser", parse_only=SoupStrainer('a')):
        if link.has_attr('href') and pattern.match(link['href']):
            matched_file = link['href']

    # Without a match there is nothing to download. Fail here with a clear
    # message rather than letting None propagate into the URL/path building
    # below, where it would surface as an unrelated-looking TypeError.
    if matched_file is None:
        raise RuntimeError(
            f"No link matching pattern {filename!r} found in {dirname} "
            f"(HTTP status {listing.status_code})"
        )

    # make the URL for the file that matches and the output file name to save to
    matched_url = url_from_filename(url, matched_file)
    out_path = outdir / matched_file

    # download the matching file; the context manager releases the streamed
    # connection even if writing the file fails part way through
    with requests.get(matched_url, stream=True, headers={'User-agent': 'Mozilla/5.0'}) as r:
        if r.status_code == 200:
            with open(out_path, 'wb') as f:
                # let urllib3 undo any gzip/deflate transfer encoding
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)