add excluded words

remove url stuff from excluded words
add excluded words
2022-06-06 19:12:11 +02:00 · 2022-06-06 17:31:27 +02:00 · 2022-06-06 17:30:42 +02:00 · 2022-06-06 17:29:22 +02:00 · 2022-06-06 15:36:36 +02:00 · 2022-06-06 12:17:55 +02:00
2 changed files with 63 additions and 16 deletions
--- a/exclude.py
+++ b/exclude.py
@@ -10,21 +10,28 @@ EXCLUDED_WORDS = {
    "aus",
    "bei",
    "bis",
+    "chf",
    "dabei",
    "dafür",
    "damit",
+    "dank",
    "das",
    "dass",
    "davon",
+    "deine",
+    "deiner",
    "dem",
    "den",
    "der",
    "des",
    "die",
    "diese",
+    "diesem",
    "dir",
+    "doch",
    "dort",
    "du",
+    "eigenen",
    "ein",
    "ein",
    "eine",
@@ -37,13 +44,17 @@ EXCLUDED_WORDS = {
    "es",
    "etwas",
    "euch",
+    "euro",
    "für",
    "gibt",
    "haben",
    "hat",
+    "heute",
    "hier",
+    "hinzu",
    "ich",
    "ihr",
+    "ihre",
    "im",
    "immer",
    "in",
@@ -51,13 +62,16 @@ EXCLUDED_WORDS = {
    "ist",
    "jetzt",
    "kann",
+    "konnte",
    "man",
    "mehr",
    "mein",
    "meine",
+    "meiner",
    "mich",
    "mir",
    "mit",
+    "morgen",
    "nach",
    "nicht",
    "noch",
@@ -73,37 +87,47 @@ EXCLUDED_WORDS = {
    "sind",
    "um",
    "und",
+    "uns",
+    "unter",
+    "viel",
    "viele",
    "von",
    "vor",
    "war",
+    "was",
+    "wenig",
+    "weniger",
    "wenn",
+    "wer",
    "werden",
    "wie",
+    "wieso",
    "wir",
    "wird",
+    "wo",
+    "wurde",
+    "wurden",
    "zu",
    "zum",
    "zur",
    "über",
    # Englische wörter
-    "and",
    "about",
+    "and",
+    "default",
+    "i",
    "in",
    "more",
    "much",
+    "no",
    "of",
    "or",
+    "that",
+    "the",
    "this",
+    "to",
+    "will",
+    "yes",
+    "you",
    "your",
-    # URL Bestandteile
-    "https",
-    "http",
-    "www",
-    "com",
-    "de",
-    "org",
-    "net",
-    "it",
-    "ch",
 }
--- a/tagger.py
+++ b/tagger.py
@@ -13,9 +13,11 @@ SOURCE_FILENAME = 'index.txt'
 OUTPUT_FILE = 'tags.json'
 TAGS_PER_ARTICLE = 5
 JSON_INDENT = 2
+EXCLUDED_HTML_TAGS = {'code'}

 # Wegen Performance vordefinierte Variablen
 _UPPER_CHECK = re.compile(r'[A-Z]')
+_LINK_PATTERN = re.compile(r'https?://\S+')


@dataclass
@@ -29,6 +31,8 @@ class FileScanner(HTMLParser):
        super().__init__()
        self.file = file
        self.texte = []
+        self.links = []
+        self._current_html_tag = None
    
    def scan_file(self):
        # Datei einlesen
@@ -39,10 +43,8 @@ class FileScanner(HTMLParser):
        words_with_usage = {}
        words = []
        for text in self.texte:
-            # Eventuelle URL-codierte Zeichen in die eigentliche Zeichen umwandeln. (z.B. %2F -> /)
-            text = unquote_plus(text)
            # Textteile in einzelne Wörter aufteilen
-            words += re.split(r'[ /\-_#\n.?=]', text)
+            words += re.split(r'[ \n/]', text)
        # Die Anzahl, der Wörter in der aktuellen Datei, auf der Konsole ausgeben
        title = self.file.parent.name
        print(f'\nFile {title} contains {len(words)} words')
@@ -50,7 +52,7 @@ class FileScanner(HTMLParser):
        title_words = set(title.split('-'))
        for word in words:
            # Verschiedene Zeichen vom Anfang und Ende der Wörter entfernen.
-            tag_name = word.strip(".,:;!\"'<>()")
+            tag_name = word.strip(".,:;!?\"'()«»")
            # Leere Wörter ignorieren
            if not tag_name:
                continue
@@ -82,21 +84,42 @@ class FileScanner(HTMLParser):
                words_with_usage[word] = Tag(name=tag_name, score=score)
            else:
                words_with_usage[word].score += score
+        
+        link_words = []
+        for link in self.links:
+            # Eventuelle URL-codierte Zeichen in die eigentlichen Zeichen umwandeln. (z.B. %2F -> /)
+            link = unquote_plus(link)
+            # Link-Teile in einzelne Wörter aufteilen
+            words += re.split(r'[/\-_#.?&=]', link)
+        for link_word in link_words:
+            link_word = link_word.lower()
+            if link_word in words_with_usage:
+                words_with_usage[link_word] += 10
+        
        # Die Wörter nach ihrer Bewertung sortieren
        return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)
    
    def handle_starttag(self, tag, attrs):
+        self._current_html_tag = tag
        # Die Links, die in den 'href' Attributen eines <a> HTML-Elements stehen, mit einbeziehen.
        if tag != "a":
            return
        for attr_name, attr_value in attrs:
            if attr_name == "href":
-                self.texte.append(attr_value)
+                self.links.append(attr_value)
                break
    
    def handle_data(self, data):
        # Den Text innerhalb eines HTML-Elements mit einbeziehen.
+        if self._current_html_tag in EXCLUDED_HTML_TAGS:
+            return
+        
+        data = _LINK_PATTERN.sub(self._link_result, data)
        self.texte.append(data)
+    
+    def _link_result(self, link_match):
+        self.links.append(link_match.group(0))
+        return ''


 def display_tags(tags, min_score):
Author	SHA1	Message	Date
OneNewDev	88717a1371	add excluded words	2022-06-06 19:12:11 +02:00
OneNewDev	e8ac8b49f4	remove url stuff from excluded words	2022-06-06 17:31:27 +02:00
OneNewDev	e62f1f2cf9	add excluded words	2022-06-06 17:30:42 +02:00
OneNewDev	20666e8e4c	change split and strip characters	2022-06-06 17:29:22 +02:00
OneNewDev	9eda3b29f8	use links to only increase score and don't add new words	2022-06-06 15:36:36 +02:00
OneNewDev	710faa4611	add two letters to strip	2022-06-06 12:17:55 +02:00
OneNewDev	0c6c621919	exclude code blocks	2022-06-06 12:17:02 +02:00