# Source code for climind.web.extract_from_word

#  Climate indicator manager - a package for managing and building climate indicator dashboards.
#  Copyright (c) 2022 John Kennedy
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.

import unicodedata
from pathlib import Path
from docx import Document
from docx.text.paragraph import Paragraph
from docx.oxml.ns import qn

import re

# Replace python-docx's ``Paragraph.text`` with a getter-only property that delegates to
# GetParagraphText, so reading ``paragraph.text`` yields text containing ``|n|`` hyperlink
# placeholders. The lambda defers the lookup of GetParagraphText until the property is read,
# because that function is defined further down in this module.
Paragraph.text = property(lambda self: GetParagraphText(self))



# [docs]
def GetParagraphText(paragraph):
    """
    Build the text of a paragraph, substituting a numbered placeholder for each hyperlink.

    Plain runs that are direct children of the paragraph element contribute their text.
    Each ``w:hyperlink`` child is numbered in order of appearance (starting at 0) and, if it
    contains at least one run, contributes a single placeholder of the form ``|n|``. The
    caller is expected to swap each placeholder for the rendered link with the same index.

    Parameters
    ----------
    paragraph
        Object exposing ``_p`` (an iterable of child XML elements, each with ``prefix`` and
        ``tag`` attributes) and ``runs`` (objects with a ``text`` attribute, one per direct
        ``w:r`` child, in order).

    Returns
    -------
    str
        Paragraph text with hyperlinks replaced by ``|0|``, ``|1|``, ...
    """
    def GetTag(element):
        # Turn a Clark-notation tag ("{namespace-uri}local") into "prefix:local".
        # Non-string tags (e.g. XML comments) cannot match anything we look for.
        tag = element.tag
        if not isinstance(tag, str):
            return None
        return "%s:%s" % (element.prefix, tag.rpartition("}")[2])

    # Fetch the run list once; the attribute may rebuild the list on every access.
    runs = paragraph.runs
    pieces = []
    runCount = 0
    linkCount = 0
    for child in paragraph._p:
        tag = GetTag(child)
        if tag == "w:r":
            pieces.append(runs[runCount].text)
            runCount += 1
        elif tag == "w:hyperlink":
            # One placeholder per hyperlink, however many runs its text is split across.
            if any(GetTag(subChild) == "w:r" for subChild in child):
                pieces.append(f"|{linkCount}|")
            # Advance for every hyperlink so placeholder numbers stay aligned with the
            # position of the hyperlink within the paragraph.
            linkCount += 1

    return ''.join(pieces)




# [docs]
def _paragraph_link_anchors(document, paragraph):
    """Return one HTML fragment per hyperlink in *paragraph*, in document order."""
    rel_id_attr = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"

    anchors = []
    for link in paragraph._element.xpath(".//w:hyperlink"):
        # Word may split the visible link text across several runs; use all of them.
        link_text = ''.join(run.text for run in link.xpath("w:r"))
        rId = link.get(rel_id_attr)
        if rId is None:
            # No relationship id (e.g. a link to a location inside the document), so there
            # is no URL to point at. Keep the visible text and keep the list index aligned.
            anchors.append(link_text)
            continue
        link_url = document._part.rels[rId]._target
        anchors.append(f'<a href="{link_url}" target="_blank"  rel="noopener">{link_text}</a>')
    return anchors


def split_document(document_name):
    """
    Split a Word document into one HTML fragment file per "Heading 1" section.

    Paragraphs styled "Normal" become ``<p>`` elements, with ``|n|`` hyperlink placeholders
    replaced by ``<a>`` tags and awkward characters cleaned. Paragraphs styled "Heading 2"
    become ``<h2>`` elements. Paragraphs with any other style are ignored, as is any content
    that appears before the first "Heading 1". Each non-empty section is written to
    ``jinja_templates/<heading>.html``, where the file name is the heading text lower-cased
    with spaces replaced by underscores.

    Parameters
    ----------
    document_name
        Path of the Word document to read; passed straight to ``Document``.

    Returns
    -------
    None
    """
    document = Document(document_name)

    # (heading, html) pairs. Keeping them together means an empty section can never shift
    # a later section's text onto the wrong heading.
    sections = []
    current_heading = None
    new_block = []

    for paragraph in document.paragraphs:
        style_name = paragraph.style.name

        if style_name == "Heading 1":
            if current_heading is not None and new_block:
                sections.append((current_heading, ''.join(new_block)))
            current_heading = paragraph.text
            new_block = []

        elif style_name == "Normal":
            text_to_append = paragraph.text
            for index, link in enumerate(_paragraph_link_anchors(document, paragraph)):
                text_to_append = text_to_append.replace(f'|{index}|', link)
            text_to_append = f"<p>\n{text_to_append}\n</p>\n"

            new_block.append(clean_awkward_characters(text_to_append))

        elif style_name == "Heading 2":
            new_block.append(f'<h2>\n{paragraph.text}\n</h2>\n')

    # Flush the final section: no later "Heading 1" exists to trigger it inside the loop.
    if current_heading is not None and new_block:
        sections.append((current_heading, ''.join(new_block)))

    for heading, text in sections:
        file_name = Path("jinja_templates") / f"{heading.lower().replace(' ', '_')}.html"
        # Explicit encoding so non-ASCII text is written identically on every platform.
        with open(file_name, 'w', encoding='utf-8') as out_file:
            out_file.write(text)




# [docs]
def clean_awkward_characters(in_text: str) -> str:
    """
    Replace characters and chemical/unit shorthands that render poorly in jinja/html.

    Typographic punctuation (en dash, curly quotes) is swapped for plain ASCII, the degree
    sign becomes an HTML entity, and a handful of formulae and units (CO2, CH4, N2O, km2,
    mm/yr) are rewritten with ``<sub>``/``<sup>`` markup. Substitutions are applied one
    after another in the order listed below.

    Parameters
    ----------
    in_text: str
        Text to tidy up

    Returns
    -------
    str
        The text with every listed substitution applied
    """
    replacements = (
        ("°", "&deg;"),
        ("–", "-"),
        ("CO2", "CO<sub>2</sub>"),
        ("CH4", "CH<sub>4</sub>"),
        ("N2O", "N<sub>2</sub>O"),
        ("km2", "km<sup>2</sup>"),
        ("mm/yr", "mm.yr<sup>-1</sup>"),
        ("’", "'"),
        ('“', '"'),
        ('”', '"'),
    )

    cleaned = in_text
    for awkward, tidy in replacements:
        cleaned = cleaned.replace(awkward, tidy)
    return cleaned



# Script entry point: split the key-indicators Word document into per-heading HTML files.
# The input path here and the "jinja_templates" output directory used by split_document are
# both relative, so they resolve against the current working directory.
if __name__ == '__main__':
    split_document('word_documents/key_indicators_texts.docx')