Tagger/tagger.py

import json
import re
from dataclasses import dataclass
from html.parser import HTMLParser
from urllib.parse import unquote_plus
from pathlib import Path

from exclude import EXCLUDED_WORDS

# Directory that is searched recursively for 'index.txt' article files.
SOURCE_DIR = Path('data')
# File the selected tags are written to as JSON.
OUTPUT_FILE = 'tags.json'
# Maximum number of tags kept per article in the output file.
TAGS_PER_ARTICLE = 5
# Indentation width used when serialising the JSON output.
JSON_INDENT = 2

# Matches a single ASCII uppercase letter; used to count capitals in a word.
_UPPER_CHECK = re.compile(r'[A-Z]')


@dataclass
class Tag:
    """A candidate tag word together with its relevance score."""
    # The word in the spelling (casing) under which it was stored.
    name: str
    # Relevance score; higher means the word is a better tag candidate.
    score: int


class FileScanner(HTMLParser):
    """Extract and score candidate tag words from a single article file.

    The file content is run through ``HTMLParser`` so that markup is dropped.
    Text nodes and the ``href`` targets of ``<a>`` elements are collected in
    ``self.texte``; ``scan_file`` then splits them into words and scores them.
    """

    # Characters at which collected text fragments are split into words.
    _WORD_SEPARATORS = re.compile(r'[ /\-_#\n.?=]')
    # Punctuation removed from both ends of every word.
    _STRIP_CHARS = ".,:;!\"'<>()"

    def __init__(self, file: Path):
        """Create a scanner for ``file``; nothing is read until ``scan_file``."""
        super().__init__()
        self.file = file
        # Text fragments (element text and link targets) gathered while parsing.
        self.texte = []

    def scan_file(self):
        """Read the file, score every word and return the tags, best first.

        Returns:
            A list of ``Tag`` objects sorted by descending score. Words are
            merged case-insensitively; the tag name keeps the spelling of the
            first occurrence.
        """
        # Read the file and let HTMLParser strip the HTML syntax elements.
        content = read_file(self.file)
        self.feed(content)
        # Flush data the parser is still buffering (e.g. an unterminated tag
        # at the end of the file); without close() that text would be lost.
        self.close()

        words = []
        for text in self.texte:
            # Decode URL-encoded characters (e.g. %2F -> /) before splitting.
            text = unquote_plus(text)
            words += self._WORD_SEPARATORS.split(text)
        print(f'\nFile {self.file.parent.name} contains {len(words)} words')

        # Words of the title (the parent folder name). They are lower-cased
        # because they are compared against lower-cased words below.
        title_words = {part.lower() for part in self.file.parent.name.split('-')}

        words_with_usage = {}
        for raw_word in words:
            # Strip punctuation from the start and end of the word.
            tag_name = raw_word.strip(self._STRIP_CHARS)
            if not tag_name:
                continue
            # Lower-case key for counting; tag_name keeps the original casing.
            word = tag_name.lower()
            # Skip excluded words and words that consist of digits only.
            if word in EXCLUDED_WORDS or word.isdigit():
                continue
            score = self._score_word(word, tag_name, title_words)
            # Store the score; repeated occurrences add up.
            if word not in words_with_usage:
                words_with_usage[word] = Tag(name=tag_name, score=score)
            else:
                words_with_usage[word].score += score
        # Highest score first.
        return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)

    @staticmethod
    def _score_word(word, tag_name, title_words):
        """Return the score of one occurrence of ``word`` (original form ``tag_name``)."""
        # Every word starts with a base score of 10.
        score = 10
        # Words that also occur in the title count four times as much.
        if word in title_words:
            score *= 4
        # Words of up to three characters are scaled down by their length.
        word_length = len(word)
        if word_length <= 3:
            score = int(score * word_length / 4)
        # Each uppercase letter in the original spelling adds 5 points.
        score += len(_UPPER_CHECK.findall(tag_name)) * 5
        return score

    def handle_starttag(self, tag, attrs):
        """Collect the ``href`` target of ``<a>`` elements as additional text."""
        if tag != "a":
            return
        for attr_name, attr_value in attrs:
            if attr_name == "href":
                # A value-less attribute (<a href>) is reported as None, which
                # unquote_plus() cannot handle, so it is skipped.
                if attr_value is not None:
                    self.texte.append(attr_value)
                break

    def handle_data(self, data):
        """Collect the text found between HTML tags."""
        self.texte.append(data)


def display_tags(tags, min_score):
    """Print every tag whose score is strictly greater than ``min_score``.

    Args:
        tags: Iterable of objects with ``score`` and ``name`` attributes.
        min_score: Threshold; tags at or below it are not printed.
    """
    shown = (entry for entry in tags if entry.score > min_score)
    for entry in shown:
        print(f"Score: {entry.score:>3} Word: {entry.name}")


class CustomJsonEncoder(json.JSONEncoder):
    """JSON encoder that serialises a ``Tag`` as its plain name string."""

    def default(self, obj):
        """Return ``obj.name`` for ``Tag`` objects; defer everything else to the base class."""
        if not isinstance(obj, Tag):
            return super().default(obj)
        return obj.name


def write_tags(tags):
    """Serialise ``tags`` to JSON and write the result to ``OUTPUT_FILE``.

    The JSON text is built completely before the file is opened, so a
    serialisation error does not leave a truncated output file behind.
    """
    serialized = json.dumps(tags, cls=CustomJsonEncoder, indent=JSON_INDENT)
    Path(OUTPUT_FILE).write_text(serialized)


def read_file(file: Path) -> str:
    """Return the complete text content of ``file``.

    The file is decoded as UTF-8 explicitly so that non-ASCII characters
    (e.g. umlauts) are read the same way on every platform instead of
    depending on the locale's default encoding.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # A separate name for the handle avoids shadowing the ``file`` parameter.
    with open(file, 'r', encoding='utf-8') as handle:
        return handle.read()


def main():
    """Scan all articles below ``SOURCE_DIR`` and write their top tags to the output file."""
    collected = {}
    # Look for every index.txt file below the source directory.
    index_files = SOURCE_DIR.glob('**/index.txt')
    for path in index_files:
        article = path.parent.name
        # Articles whose folder name starts with 'autosave-' are ignored.
        if not article.startswith('autosave-'):
            # Analyse the file and show the results on the console.
            ranked = FileScanner(path).scan_file()
            display_tags(ranked, min_score=20)
            # Keep at most the configured number of tags for the output file.
            if len(ranked) <= TAGS_PER_ARTICLE:
                collected[article] = ranked
            else:
                collected[article] = ranked[:TAGS_PER_ARTICLE]
    # Write the output file.
    write_tags(collected)


# Run the tagger only when this module is executed as a script.
if __name__ == '__main__':
    main()
write output json file 2022-05-28 12:49:10 +00:00			`import json`
init commit 2022-05-05 16:01:10 +00:00			`import re`
commit 2022-05-05 17:30:56 +00:00			`from dataclasses import dataclass`
			`from html.parser import HTMLParser`
unquote url 2022-05-28 13:34:41 +00:00			`from urllib.parse import unquote_plus`
commit 2022-05-05 17:30:56 +00:00			`from pathlib import Path`

			`from exclude import EXCLUDED_WORDS`

write output json file 2022-05-28 12:49:10 +00:00			`SOURCE_DIR = Path('data')`
			`OUTPUT_FILE = 'tags.json'`
			`TAGS_PER_ARTICLE = 5`
			`JSON_INDENT = 2`

add code comments 2022-05-28 17:22:30 +00:00			`_UPPER_CHECK = re.compile(r'[A-Z]')`
init commit 2022-05-05 16:01:10 +00:00

commit 2022-05-05 17:30:56 +00:00			`@dataclass`
			`class Tag:`
			`name: str`
			`score: int`


			`class FileScanner(HTMLParser):`
			`def __init__(self, file: Path):`
			`super().__init__()`
			`self.file = file`
			`self.texte = []`

			`def scan_file(self):`
add code comments 2022-05-28 17:22:30 +00:00			`# Datei einlesen`
commit 2022-05-05 17:30:56 +00:00			`content = read_file(self.file)`
add code comments 2022-05-28 17:22:30 +00:00			`# HTMLParser aufrufen um HTML-Syntax-Elemente zu entfernen.`
commit 2022-05-05 17:30:56 +00:00			`self.feed(content)`

			`words_with_usage = {}`
			`words = []`
			`for text in self.texte:`
add code comments 2022-05-28 17:22:30 +00:00			`# Eventuelle URL-codierte Zeichen in die eigentliche Zeichen umwandeln. (z.B. %2F -> /)`
unquote url 2022-05-28 13:34:41 +00:00			`text = unquote_plus(text)`
add code comments 2022-05-28 17:22:30 +00:00			`# Textteile in einzelne Wörter aufteilen`
split url letters 2022-05-28 13:33:16 +00:00			`words += re.split(r'[ /\-_#\n.?=]', text)`
commit 2022-05-05 17:30:56 +00:00			`print(f'\nFile {self.file.parent.name} contains {len(words)} words')`
add code comments 2022-05-28 17:22:30 +00:00			`# Titel in einzelne Wörter aufteilen`
commit 2022-05-05 17:30:56 +00:00			`title_words = set(self.file.parent.name.split('-'))`
			`for word in words:`
add code comments 2022-05-28 17:22:30 +00:00			`# Verschiedene Zeichen vom Anfang und Ende der Wörter entfernen.`
commit 2022-05-05 17:30:56 +00:00			`tag_name = word.strip(".,:;!\"'<>()")`
add code comments 2022-05-28 17:22:30 +00:00			`# Leere Wörter ignorieren`
minimal performance improvement 2022-05-28 12:50:00 +00:00			`if not tag_name:`
commit 2022-05-05 17:30:56 +00:00			`continue`
add code comments 2022-05-28 17:22:30 +00:00			`# Alle Buchstaben verkleinern, aber gleichzeitig originales Wort merken`
minimal performance improvement 2022-05-28 12:50:00 +00:00			`word = tag_name.lower()`
add code comments 2022-05-28 17:22:30 +00:00			`# Standard Bewertung für jedes Wort ist 10`
commit 2022-05-05 17:30:56 +00:00			`score = 10`
add code comments 2022-05-28 17:22:30 +00:00			`# Wörter, die in der Liste der ausgeschlossenen Wörter stehen, ignorieren`
commit 2022-05-05 17:30:56 +00:00			`if word in EXCLUDED_WORDS:`
ignore excluded words completely 2022-05-28 13:15:55 +00:00			`continue`
add code comments 2022-05-28 17:22:30 +00:00			`# Wörter, die nur aus Zahlen bestehen, ignorieren`
ignore digits 2022-05-28 13:33:29 +00:00			`if word.isdigit():`
			`continue`
add code comments 2022-05-28 17:22:30 +00:00			`# Die Bewertung von Wörtern, die im Titel vorkommen, deutlich verbessern.`
commit 2022-05-05 17:30:56 +00:00			`if word in title_words:`
			`score *= 4`
add code comments 2022-05-28 17:22:30 +00:00			`# Die Bewertung von Wörtern, die kürzer oder gleich lang sind als 3 Buchstaben,`
			`# entsprechend der Länge des Wortes verringern.`
change handling of words with less than 3 characters 2022-05-28 13:19:00 +00:00			`word_length = len(word)`
			`if word_length <= 3:`
			`score = int(score * word_length / 4)`
add code comments 2022-05-28 17:22:30 +00:00			`# Die Anzahl der Großbuchstaben in dem originalen Wort zählen ...`
			`upper_letters_count = len(_UPPER_CHECK.findall(tag_name))`
			`# ... und die Bewertung entsprechen der Anzahl verbessern.`
reverse sorting and change upper letter multiplier 2022-05-07 17:50:36 +00:00			`score += upper_letters_count * 5`
add code comments 2022-05-28 17:22:30 +00:00			`# Die Bewertung für das Wort speichern.`
			`# Wenn das Wort bereits eine Bewertung besitzt werden die beiden Bewertungen zusammen gerechnet.`
commit 2022-05-05 17:30:56 +00:00			`if word not in words_with_usage:`
			`words_with_usage[word] = Tag(name=tag_name, score=score)`
			`else:`
			`words_with_usage[word].score += score`
add code comments 2022-05-28 17:22:30 +00:00			`# Die Wörter nach ihrer Bewertung sortieren`
write output json file 2022-05-28 12:49:10 +00:00			`return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)`
commit 2022-05-05 17:30:56 +00:00
Revert "don't include links anymore" This reverts commit fba3f789bb2bfc46ea8741aca5fcebe838eb8c50. 2022-05-28 13:24:19 +00:00			`def handle_starttag(self, tag, attrs):`
add code comments 2022-05-28 17:22:30 +00:00			`# Die Links, die in den 'href' Attributen eines <a> HTML-Elements stehen, mit einbeziehen.`
Revert "don't include links anymore" This reverts commit fba3f789bb2bfc46ea8741aca5fcebe838eb8c50. 2022-05-28 13:24:19 +00:00			`if tag != "a":`
			`return`
			`for attr_name, attr_value in attrs:`
			`if attr_name == "href":`
			`self.texte.append(attr_value)`
			`break`

commit 2022-05-05 17:30:56 +00:00			`def handle_data(self, data):`
add code comments 2022-05-28 17:22:30 +00:00			`# Den Text innerhalb eines HTML-Elements mit einbeziehen.`
commit 2022-05-05 17:30:56 +00:00			`self.texte.append(data)`
init commit 2022-05-05 16:01:10 +00:00

set minimum score to 20 2022-05-28 12:52:25 +00:00			`def display_tags(tags, min_score):`
add code comments 2022-05-28 17:22:30 +00:00			`# Die Ergebnisse auf der Konsole ausgeben.`
write output json file 2022-05-28 12:49:10 +00:00			`for tag in tags:`
			`if tag.score <= min_score:`
commit 2022-05-05 17:30:56 +00:00			`continue`
			`print(f"Score: {tag.score:>3} Word: {tag.name}")`
init commit 2022-05-05 16:01:10 +00:00

write output json file 2022-05-28 12:49:10 +00:00			`class CustomJsonEncoder(json.JSONEncoder):`
			`def default(self, obj):`
			`if isinstance(obj, Tag):`
			`return obj.name`
			`return super().default(obj)`


			`def write_tags(tags):`
add code comments 2022-05-28 17:22:30 +00:00			`# Die Ergebnisse in JSON umwandeln.`
write output json file 2022-05-28 12:49:10 +00:00			`content = json.dumps(tags, indent=JSON_INDENT, cls=CustomJsonEncoder)`
add code comments 2022-05-28 17:22:30 +00:00			`# Das JSON in eine Datei schreiben.`
write output json file 2022-05-28 12:49:10 +00:00			`with open(OUTPUT_FILE, 'w') as file:`
			`file.write(content)`


init commit 2022-05-05 16:01:10 +00:00			`def read_file(file: Path) -> str:`
add code comments 2022-05-28 17:22:30 +00:00			`# Eine Datei einlesen`
init commit 2022-05-05 16:01:10 +00:00			`with open(file, 'r') as file:`
			`return file.read()`


write output json file 2022-05-28 12:49:10 +00:00			`def main():`
			`final_tags = {}`
add code comments 2022-05-28 17:22:30 +00:00			`for file in SOURCE_DIR.glob('**/index.txt'): # Nach allen index.txt Dateien suchen`
			`# Die Dateien, deren Ordner mit 'autosave-' beginnen, ignorieren.`
write output json file 2022-05-28 12:49:10 +00:00			`title = file.parent.name`
			`if title.startswith('autosave-'):`
ignore autosave articles 2022-05-28 12:08:28 +00:00			`continue`
add code comments 2022-05-28 17:22:30 +00:00			`# Die Datei analysieren`
commit 2022-05-05 17:30:56 +00:00			`scanner = FileScanner(file)`
write output json file 2022-05-28 12:49:10 +00:00			`tags = scanner.scan_file()`
add code comments 2022-05-28 17:22:30 +00:00			`# Die Ergebnisse auf der Konsole ausgeben`
set minimum score to 20 2022-05-28 12:52:25 +00:00			`display_tags(tags, min_score=20)`
add code comments 2022-05-28 17:22:30 +00:00			`# Die eingestellte Anzahl an Tags für die Ausgabedatei übernehmen, sofern vorhanden.`
write output json file 2022-05-28 12:49:10 +00:00			`final_tags[title] = tags[:TAGS_PER_ARTICLE] if len(tags) > TAGS_PER_ARTICLE else tags`
add code comments 2022-05-28 17:22:30 +00:00			`# Die Ausgabedatei schreiben`
			`write_tags(final_tags)`
init commit 2022-05-05 16:01:10 +00:00

			`if __name__ == '__main__':`
			`main()`