unquote url

This commit is contained in:
OneNewDev 2022-05-28 15:34:41 +02:00
parent 737634e740
commit 819d6d51b9

View file

@@ -2,6 +2,7 @@ import json
import re
from dataclasses import dataclass
from html.parser import HTMLParser
from urllib.parse import unquote_plus
from pathlib import Path
from exclude import EXCLUDED_WORDS
@@ -33,6 +34,7 @@ class FileScanner(HTMLParser):
words_with_usage = {}
words = []
for text in self.texte:
text = unquote_plus(text)
words += re.split(r'[ /\-_#\n.?=]', text)
print(f'\nFile {self.file.parent.name} contains {len(words)} words')
title_words = set(self.file.parent.name.split('-'))