use links to only increase score and don't add new words

2022-06-06 15:36:36 +02:00 · 2022-06-06 15:36:36 +02:00 · 9eda3b29f8
commit 9eda3b29f8
parent 710faa4611
1 changed files with 27 additions and 8 deletions
--- a/tagger.py
+++ b/tagger.py
@ -17,6 +17,7 @@ EXCLUDED_HTML_TAGS = {'code'}

 # Wegen Performance vordefinierte Variablen
 _UPPER_CHECK = re.compile(r'[A-Z]')
+_LINK_PATTERN = re.compile(r'https?://\S+')


@dataclass
@ -30,6 +31,7 @@ class FileScanner(HTMLParser):
        super().__init__()
        self.file = file
        self.texte = []
+        self.links = []
        self._current_html_tag = None
    
    def scan_file(self):
@ -41,10 +43,8 @@ class FileScanner(HTMLParser):
        words_with_usage = {}
        words = []
        for text in self.texte:
-            # Eventuelle URL-codierte Zeichen in die eigentliche Zeichen umwandeln. (z.B. %2F -> /)
-            text = unquote_plus(text)
            # Textteile in einzelne Wörter aufteilen
-            words += re.split(r'[ /\-_#\n.?=]', text)
+            words += re.split(r'[ \n\-_/]', text)
        # Die Anzahl, der Wörter in der aktuellen Datei, auf der Konsole ausgeben
        title = self.file.parent.name
        print(f'\nFile {title} contains {len(words)} words')
@ -52,7 +52,7 @@ class FileScanner(HTMLParser):
        title_words = set(title.split('-'))
        for word in words:
            # Verschiedene Zeichen vom Anfang und Ende der Wörter entfernen.
-            tag_name = word.strip(".,:;!\"'<>()«»")
+            tag_name = word.strip(".,:;!\"'()«»")
            # Leere Wörter ignorieren
            if not tag_name:
                continue
@ -84,6 +84,18 @@ class FileScanner(HTMLParser):
                words_with_usage[word] = Tag(name=tag_name, score=score)
            else:
                words_with_usage[word].score += score
+        
+        link_words = []
+        for link in self.links:
+            # Decode URL-encoded characters back into the real characters (e.g. %2F -> /).
+            link = unquote_plus(link)
+            # Split the link into words; keep them separate so links only boost existing tags.
+            link_words += re.split(r'[/\-_#.?&=]', link)
+        for link_word in link_words:
+            link_word = link_word.lower()
+            if link_word in words_with_usage:
+                words_with_usage[link_word].score += 10
+        
        # Die Wörter nach ihrer Bewertung sortieren
        return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)
    
@ -94,13 +106,20 @@ class FileScanner(HTMLParser):
            return
        for attr_name, attr_value in attrs:
            if attr_name == "href":
-                self.texte.append(attr_value)
+                self.links.append(attr_value)
                break
    
    def handle_data(self, data):
-        if self._current_html_tag not in EXCLUDED_HTML_TAGS:
-            # Den Text innerhalb eines HTML-Elements mit einbeziehen.
-            self.texte.append(data)
+        # Den Text innerhalb eines HTML-Elements mit einbeziehen.
+        if self._current_html_tag in EXCLUDED_HTML_TAGS:
+            return
+        
+        data = _LINK_PATTERN.sub(self._link_result, data)
+        self.texte.append(data)
+    
+    def _link_result(self, link_match):
+        self.links.append(link_match.group(0))
+        return ''


 def display_tags(tags, min_score):