unquote url
This commit is contained in:
parent
737634e740
commit
819d6d51b9
|
@ -2,6 +2,7 @@ import json
|
||||||
import re
|
import re
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
|
from urllib.parse import unquote_plus
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from exclude import EXCLUDED_WORDS
|
from exclude import EXCLUDED_WORDS
|
||||||
|
@ -33,6 +34,7 @@ class FileScanner(HTMLParser):
|
||||||
words_with_usage = {}
|
words_with_usage = {}
|
||||||
words = []
|
words = []
|
||||||
for text in self.texte:
|
for text in self.texte:
|
||||||
|
text = unquote_plus(text)
|
||||||
words += re.split(r'[ /\-_#\n.?=]', text)
|
words += re.split(r'[ /\-_#\n.?=]', text)
|
||||||
print(f'\nFile {self.file.parent.name} contains {len(words)} words')
|
print(f'\nFile {self.file.parent.name} contains {len(words)} words')
|
||||||
title_words = set(self.file.parent.name.split('-'))
|
title_words = set(self.file.parent.name.split('-'))
|
||||||
|
|
Loading…
Reference in a new issue