# Climate indicator manager - a package for managing and building climate indicator dashboards.
# Copyright (c) 2023 John Kennedy
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import itertools
import re
from datetime import datetime
from pathlib import Path
import requests
import shutil
from bs4 import BeautifulSoup, SoupStrainer
from climind.fetchers.fetcher_utils import dir_and_filename_from_url, url_from_filename, get_n_months_back, \
fill_year_month
[docs]
def get_time_span(filled_url):
    """
    Work out the accumulation period, in months, encoded in a URL.

    The allowed periods are 1, 3, 6, 9 and 12 months. They are tried in that order and the
    first one whose "<n>month" token occurs anywhere in the URL is returned.

    Parameters
    ----------
    filled_url: str
        URL (or file name) containing a token such as "12month"

    Returns
    -------
    int
        Number of months in the matched time span

    Raises
    ------
    ValueError
        If none of the allowed "<n>month" tokens occurs in the URL
    """
    allowed_spans = (1, 3, 6, 9, 12)
    # Lazily scan the candidates; None signals that nothing matched.
    matched = next((span for span in allowed_spans if f'{span}month' in filled_url), None)
    if matched is None:
        raise ValueError("Filled URL does not match one of the allowed time spans")
    return matched
[docs]
def get_file(filled_url, out_path, timeout=60):
    """
    Download a single file and write it to disk.

    The download is best-effort: a response with any status other than 200 is skipped
    silently (nothing is written), and a failure to connect or a timeout is reported on
    standard output rather than raised.

    Parameters
    ----------
    filled_url: str
        Fully resolved URL of the file to download
    out_path: str or Path
        Location to which the downloaded bytes are written
    timeout: float or tuple
        Timeout in seconds handed to requests.get, so that an unresponsive server cannot
        stall the caller indefinitely. Defaults to 60.

    Returns
    -------
    None
    """
    try:
        # Using the response as a context manager guarantees the streamed connection is
        # released, including when the status is not 200 and the body is never read.
        with requests.get(filled_url, stream=True, headers={'User-agent': 'Mozilla/5.0'},
                          timeout=timeout) as r:
            if r.status_code == 200:
                with open(out_path, 'wb') as f:
                    # Have the raw stream undo any content encoding (e.g. gzip) so the
                    # file on disk holds the decoded payload.
                    r.raw.decode_content = True
                    shutil.copyfileobj(r.raw, f)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # NOTE: errors raised by the raw stream while the body is being copied are not
        # requests exceptions and still propagate to the caller, as before.
        print(f"Couldn't connect to {filled_url}")
[docs]
def fetch(url: str, outdir: Path, _) -> None:
    """
    Fetch GPCC quantile data. A candidate URL is built for every month of every year from
    1982 up to and including the current year, and each one is downloaded into the output
    directory.

    Files already present in the output directory are not downloaded again, except those for
    the current and previous year, which are always re-fetched.

    Parameters
    ----------
    url: str
        URL of the file to be downloaded, containing placeholders for the year and month, a
        "<n>month" token giving the time span, and a '*' wildcard that is replaced by the
        year and month (YYYYMM) lying that many months back
    outdir: Path
        Path of the directory to which the output will be written
    _:
        Unused

    Returns
    -------
    None
    """
    current_year = datetime.now().year

    for year in range(1982, current_year + 1):
        for month in range(1, 13):
            # Substitute the year and month placeholders (YYYY and MMMM)
            dated_url = fill_year_month(url, year, month)

            # The wildcard is filled with the year and month that lie one time span back
            span = get_time_span(dated_url)
            earlier_year, earlier_month = get_n_months_back(year, month, back=span)
            resolved_url = dated_url.replace('*', f'{earlier_year}{earlier_month:02d}')

            _directory, filename = dir_and_filename_from_url(resolved_url)
            target = outdir / filename

            # Older files that are already on disk are left alone. The past two years are
            # always scooped up again so that updates from first guess to monitoring are
            # picked up.
            if target.exists() and year < current_year - 1:
                continue
            get_file(resolved_url, target)