diff --git a/tagger.py b/tagger.py index 299d084..0299aa1 100644 --- a/tagger.py +++ b/tagger.py @@ -2,6 +2,7 @@ import json import re from dataclasses import dataclass from html.parser import HTMLParser +from urllib.parse import unquote_plus from pathlib import Path from exclude import EXCLUDED_WORDS @@ -33,6 +34,7 @@ class FileScanner(HTMLParser): words_with_usage = {} words = [] for text in self.texte: + text = unquote_plus(text) words += re.split(r'[ /\-_#\n.?=]', text) print(f'\nFile {self.file.parent.name} contains {len(words)} words') title_words = set(self.file.parent.name.split('-'))