From 0c6c621919e967c3501a0b0c2a640925c396c05a Mon Sep 17 00:00:00 2001 From: OneNewDev Date: Mon, 6 Jun 2022 12:17:02 +0200 Subject: [PATCH 1/7] exclude code blocks --- tagger.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tagger.py b/tagger.py index 437d95c..a03a81d 100644 --- a/tagger.py +++ b/tagger.py @@ -13,6 +13,7 @@ SOURCE_FILENAME = 'index.txt' OUTPUT_FILE = 'tags.json' TAGS_PER_ARTICLE = 5 JSON_INDENT = 2 +EXCLUDED_HTML_TAGS = {'code'} # Wegen Performance vordefinierte Variablen _UPPER_CHECK = re.compile(r'[A-Z]') @@ -29,6 +30,7 @@ class FileScanner(HTMLParser): super().__init__() self.file = file self.texte = [] + self._current_html_tag = None def scan_file(self): # Datei einlesen @@ -86,6 +88,7 @@ class FileScanner(HTMLParser): return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True) def handle_starttag(self, tag, attrs): + self._current_html_tag = tag # Die Links, die in den 'href' Attributen eines HTML-Elements stehen, mit einbeziehen. if tag != "a": return @@ -95,8 +98,9 @@ class FileScanner(HTMLParser): break def handle_data(self, data): - # Den Text innerhalb eines HTML-Elements mit einbeziehen. - self.texte.append(data) + if self._current_html_tag not in EXCLUDED_HTML_TAGS: + # Den Text innerhalb eines HTML-Elements mit einbeziehen. + self.texte.append(data) def display_tags(tags, min_score): From 710faa4611536dfefc629b335084f313d39f5230 Mon Sep 17 00:00:00 2001 From: OneNewDev Date: Mon, 6 Jun 2022 12:17:55 +0200 Subject: [PATCH 2/7] add two letters to strip --- tagger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tagger.py b/tagger.py index a03a81d..a0009ed 100644 --- a/tagger.py +++ b/tagger.py @@ -52,7 +52,7 @@ class FileScanner(HTMLParser): title_words = set(title.split('-')) for word in words: # Verschiedene Zeichen vom Anfang und Ende der Wörter entfernen. - tag_name = word.strip(".,:;!\"'<>()") + tag_name = word.strip(".,:;!\"'<>()«»") # Leere Wörter ignorieren if not tag_name: continue From 9eda3b29f83c3b04357dc0e2ccd9771cdc4413b6 Mon Sep 17 00:00:00 2001 From: OneNewDev Date: Mon, 6 Jun 2022 15:36:36 +0200 Subject: [PATCH 3/7] use links to only increase score and don't add new words --- tagger.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/tagger.py b/tagger.py index a0009ed..cbe31f2 100644 --- a/tagger.py +++ b/tagger.py @@ -17,6 +17,7 @@ EXCLUDED_HTML_TAGS = {'code'} # Wegen Performance vordefinierte Variablen _UPPER_CHECK = re.compile(r'[A-Z]') +_LINK_PATTERN = re.compile(r'https?://\S+') @dataclass @@ -30,6 +31,7 @@ class FileScanner(HTMLParser): super().__init__() self.file = file self.texte = [] + self.links = [] self._current_html_tag = None def scan_file(self): @@ -41,10 +43,8 @@ class FileScanner(HTMLParser): words_with_usage = {} words = [] for text in self.texte: - # Eventuelle URL-codierte Zeichen in die eigentliche Zeichen umwandeln. (z.B. %2F -> /) - text = unquote_plus(text) # Textteile in einzelne Wörter aufteilen - words += re.split(r'[ /\-_#\n.?=]', text) + words += re.split(r'[ \n\-_/]', text) # Die Anzahl, der Wörter in der aktuellen Datei, auf der Konsole ausgeben title = self.file.parent.name print(f'\nFile {title} contains {len(words)} words') @@ -52,7 +52,7 @@ class FileScanner(HTMLParser): title_words = set(title.split('-')) for word in words: # Verschiedene Zeichen vom Anfang und Ende der Wörter entfernen. - tag_name = word.strip(".,:;!\"'<>()«»") + tag_name = word.strip(".,:;!\"'()«»") # Leere Wörter ignorieren if not tag_name: continue @@ -84,6 +84,18 @@ class FileScanner(HTMLParser): words_with_usage[word] = Tag(name=tag_name, score=score) else: words_with_usage[word].score += score + + link_words = [] + for link in self.links: + # Eventuelle URL-codierte Zeichen in die eigentlichen Zeichen umwandeln. (z.B. %2F -> /) + link = unquote_plus(link) + # Link-Teile in einzelne Wörter aufteilen + words += re.split(r'[/\-_#.?&=]', link) + for link_word in link_words: + link_word = link_word.lower() + if link_word in words_with_usage: + words_with_usage[link_word] += 10 + # Die Wörter nach ihrer Bewertung sortieren return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True) @@ -94,13 +106,20 @@ class FileScanner(HTMLParser): return for attr_name, attr_value in attrs: if attr_name == "href": - self.texte.append(attr_value) + self.links.append(attr_value) break def handle_data(self, data): - if self._current_html_tag not in EXCLUDED_HTML_TAGS: - # Den Text innerhalb eines HTML-Elements mit einbeziehen. - self.texte.append(data) + # Den Text innerhalb eines HTML-Elements mit einbeziehen. + if self._current_html_tag in EXCLUDED_HTML_TAGS: + return + + data = _LINK_PATTERN.sub(self._link_result, data) + self.texte.append(data) + + def _link_result(self, link_match): + self.links.append(link_match.group(0)) + return '' def display_tags(tags, min_score): From 20666e8e4cbba2a11d1c3096033d1fb65860104d Mon Sep 17 00:00:00 2001 From: OneNewDev Date: Mon, 6 Jun 2022 17:29:22 +0200 Subject: [PATCH 4/7] change split and strip characters --- tagger.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tagger.py b/tagger.py index cbe31f2..83e1ec8 100644 --- a/tagger.py +++ b/tagger.py @@ -44,7 +44,7 @@ class FileScanner(HTMLParser): words = [] for text in self.texte: # Textteile in einzelne Wörter aufteilen - words += re.split(r'[ \n\-_/]', text) + words += re.split(r'[ \n/]', text) # Die Anzahl, der Wörter in der aktuellen Datei, auf der Konsole ausgeben title = self.file.parent.name print(f'\nFile {title} contains {len(words)} words') @@ -52,7 +52,7 @@ class FileScanner(HTMLParser): title_words = set(title.split('-')) for word in words: # Verschiedene Zeichen vom Anfang und Ende der Wörter entfernen. - tag_name = word.strip(".,:;!\"'()«»") + tag_name = word.strip(".,:;!?\"'()«»") # Leere Wörter ignorieren if not tag_name: continue From e62f1f2cf9a35951a3c0926738329cf3f8d1e5ba Mon Sep 17 00:00:00 2001 From: OneNewDev Date: Mon, 6 Jun 2022 17:30:42 +0200 Subject: [PATCH 5/7] add excluded words --- exclude.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/exclude.py b/exclude.py index 438cd39..21e5951 100644 --- a/exclude.py +++ b/exclude.py @@ -10,21 +10,27 @@ EXCLUDED_WORDS = { "aus", "bei", "bis", + "chf", "dabei", "dafür", "damit", + "dank", "das", "dass", "davon", + "deine", + "deiner", "dem", "den", "der", "des", "die", "diese", + "diesem", "dir", "dort", "du", + "eigenen", "ein", "ein", "eine", @@ -37,13 +43,16 @@ EXCLUDED_WORDS = { "es", "etwas", "euch", + "euro", "für", "gibt", "haben", "hat", "hier", + "hinzu", "ich", "ihr", + "ihre", "im", "immer", "in", @@ -51,10 +60,12 @@ EXCLUDED_WORDS = { "ist", "jetzt", "kann", + "konnte", "man", "mehr", "mein", "meine", + "meiner", "mich", "mir", "mit", @@ -73,21 +84,32 @@ EXCLUDED_WORDS = { "sind", "um", "und", + "uns", + "unter", "viele", "von", "vor", "war", + "was", "wenn", + "wer", "werden", "wie", + "wieso", "wir", "wird", + "wo", + "wurde", + "wurden", "zu", "zum", "zur", "über", # Englische wörter "and", + "default", + "yes", + "no", "about", "in", "more", From e8ac8b49f41dc13d5e87ca8d5f4b00e1eba755b3 Mon Sep 17 00:00:00 2001 From: OneNewDev Date: Mon, 6 Jun 2022 17:31:27 +0200 Subject: [PATCH 6/7] remove url stuff from excluded words --- exclude.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/exclude.py b/exclude.py index 21e5951..64f869f 100644 --- a/exclude.py +++ b/exclude.py @@ -118,14 +118,4 @@ EXCLUDED_WORDS = { "or", "this", "your", - # URL Bestandteile - "https", - "http", - "www", - "com", - "de", - "org", - "net", - "it", - "ch", } From 88717a137148384da5c4357c92740cc14d8a4cc7 Mon Sep 17 00:00:00 2001 From: OneNewDev Date: Mon, 6 Jun 2022 19:12:11 +0200 Subject: [PATCH 7/7] add excluded words --- exclude.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/exclude.py b/exclude.py index 64f869f..553c591 100644 --- a/exclude.py +++ b/exclude.py @@ -28,6 +28,7 @@ EXCLUDED_WORDS = { "diese", "diesem", "dir", + "doch", "dort", "du", "eigenen", @@ -48,6 +49,7 @@ EXCLUDED_WORDS = { "gibt", "haben", "hat", + "heute", "hier", "hinzu", "ich", @@ -69,6 +71,7 @@ EXCLUDED_WORDS = { "mich", "mir", "mit", + "morgen", "nach", "nicht", "noch", @@ -86,11 +89,14 @@ EXCLUDED_WORDS = { "und", "uns", "unter", + "viel", "viele", "von", "vor", "war", "was", + "wenig", + "weniger", "wenn", "wer", "werden", @@ -106,16 +112,22 @@ EXCLUDED_WORDS = { "zur", "über", # Englische wörter + "about", "and", "default", - "yes", - "no", - "about", + "i", "in", "more", "much", + "no", "of", "or", + "that", + "the", "this", + "to", + "will", + "yes", + "you", "your", }