From 819d6d51b9998d7ac1fcf834f36f0692ec51a10f Mon Sep 17 00:00:00 2001
From: OneNewDev
Date: Sat, 28 May 2022 15:34:41 +0200
Subject: [PATCH] unquote url

---
 tagger.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tagger.py b/tagger.py
index 299d084..0299aa1 100644
--- a/tagger.py
+++ b/tagger.py
@@ -2,6 +2,7 @@ import json
 import re
 from dataclasses import dataclass
 from html.parser import HTMLParser
+from urllib.parse import unquote_plus
 from pathlib import Path
 
 from exclude import EXCLUDED_WORDS
@@ -33,6 +34,7 @@ class FileScanner(HTMLParser):
         words_with_usage = {}
         words = []
         for text in self.texte:
+            text = unquote_plus(text)
             words += re.split(r'[ /\-_#\n.?=]', text)
         print(f'\nFile {self.file.parent.name} contains {len(words)} words')
         title_words = set(self.file.parent.name.split('-'))