import inspect
import os
import sys
from random import choice
from typing import List
__author__ = "GLNB"
__copyright__ = "GLNB"
__license__ = "mit"
# Import the character tables relative to the package when this module is
# part of one, and fall back to a plain import when it is loaded as a
# top-level module (e.g. run directly from its own directory).
try:
    from .dictionaries import invisible_chars, dict_latin
except ImportError:
    from dictionaries import invisible_chars, dict_latin

# Directory holding this source file. The file path reported by ``inspect``
# is joined onto the current working directory, which only has an effect
# when that path is relative. Bundled data files are located from here.
__location__ = os.path.join(
    os.getcwd(), os.path.dirname(inspect.getfile(inspect.currentframe()))
)
class Scrambler:
    """Scramble text with invisible and look-alike Unicode characters.

    Depending on the level, invisible characters are inserted between the
    characters of a text and/or characters are swapped for visually similar
    ones. The look-alikes used by level 4 are obtained by parsing the Unicode
    list of confusable characters.

    Every substitution is picked at random, so the outputs shown below are
    only illustrative and differ from one run to the next.

    .. code:: python

        >>> from text_scrambler import Scrambler
        >>> scr = Scrambler()
        >>> text = "This is an example"
        >>> text_1 = scr.scramble(text, level=1)
        >>> #############
        >>> # adding only zwj/zwnj characters
        >>> print(text, text_1, sep="\\n")
        This is an example
        This is an example
        >>> assert text != text_1
        >>> print(len(text), len(text_1))
        18 35
        >>> # though the texts look similar, the second one has more characters
        >>> #############
        >>> text_2 = scr.scramble(text, level=2)
        >>> # replacing some latin letters by their cyrillic/greek equivalent
        >>> print(text_2)
        Тhiѕ iѕ an ехаmple
        >>> for char, char_2 in zip(text, text_2):
        ...     if char != char_2:
        ...         print(char, char_2)
        ...
        T Т
        s ѕ
        s ѕ
        e е
        x х
        a а
        >>> #############
        >>> text_3 = scr.scramble(text, level=3)
        >>> # adding zwj/zwnj characters and replacing latin letters
        >>> print(text_3)
        Thіs iѕ аn eхаmple
        >>> print(text, text_3, sep="\\n")
        This is an example
        Thіs iѕ аn eхаmple
        >>> assert text_3 != text
        >>> #############
        >>> text_4 = scr.scramble(text, level=4)
        >>> # replacing all characters by any unicode looking like character
        >>> print(text_4)
        ⊤𝒽𝐢𝘴 𝘪𝙨 𝞪ռ 𝙚⨯𝚊mρ𝟙ҽ
        >>> #
        >>> # generating several versions
        >>> versions = scr.generate(text, 10, level=4)
        >>> for txt in versions:
        ...     print(txt)
        ...
        𝕋𝗵𝕚𝔰 𝙞ѕ ɑ𝗇 ꬲ𝗑𝒂m𝛠Ⲓ𝚎
        𝔗һ𑣃ƽ ˛ꜱ 𝛼𝐧 𝐞𝖝𝛼m𝜌𝟏ℯ
        Th𝓲𝔰 ⅈ𝔰 αn ꬲ⤬αm⍴𞸀e
        𝗧𝗵i𝑠 i𝖘 ⍺𝘯 𝗲𝔁аm𝘱𝙸𝔢
        ⊤𝚑𝑖s ɪ𝚜 𝜶𝑛 𝖾𝘅𝒶m𝛒𝑙𝓮
        𝘛h𝙞ꮪ ⅈ𝗌 𝗮𝐧 ꬲᕽ𝓪m𝜌⏽𝓮
        𝙏𝕙і𝓈 ıꜱ 𝔞𝕟 𝗲𝕩𝛂mр𐌉𝚎
        𝕿Ꮒℹ𝐬 𝗶𝗌 𝛼𝔫 𝗲𝐱𝓪m𝞎𝙡𝖊
        ⟙h𝜾ꮪ i𝘴 𝝰𝒏 𝙚ᕽ𝗮m𝗽𝗜𝗲
        𝖳հ𝒊s 𝕚𝙨 𝖆𝑛 𝘦𝔁аm𝜌𝐈𝗲
        >>> versions = scr.generate(text, 1000, level=1)
        >>> assert len(versions) == len(set(versions))
        >>> # all unique
    """

    def __init__(self, confusables_file=None):
        """Load the character tables and parse the confusables file.

        :param confusables_file: path of the Unicode confusables summary file
            to parse; when omitted, ``txt_files/confusablesSummary.txt``
            under the module directory is used
        """
        # The confusables can be found at:
        # https://www.unicode.org/Public/security/13.0.0/confusables.txt
        if confusables_file is None:
            # Resolved at call time so that the signature does not carry a
            # machine-specific absolute path.
            confusables_file = os.path.join(
                __location__, "txt_files", "confusablesSummary.txt"
            )
        self.confusables_file = confusables_file
        self.invisible_chars = invisible_chars
        self.dict_latin = dict_latin
        self._parse_unicode_file()

    def __str__(self):
        """Return a level-4 scramble of a generic object description."""
        return self.scramble("<__main__.Scrambler object>", level=4)

    __repr__ = __str__

    def __enter__(self):
        """Return the scrambler itself as the context value."""
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Report a pending exception on stderr.

        Nothing is returned, so the exception is not suppressed and keeps
        propagating after the ``with`` block.
        """
        if exc_type:
            print(f"exc_type: {exc_type}", file=sys.stderr)
            print(f"exc_value: {exc_value}", file=sys.stderr)
            print(f"exc_traceback: {exc_traceback}", file=sys.stderr)

    def _parse_unicode_file(self) -> dict:
        """Parse ``self.confusables_file`` into ``self.unicode_dict``.

        The first 32 lines are skipped (presumably the file header). Of the
        remaining lines only those starting with ``#`` are used, except the
        last one (the total). Each such line is split on tabs: the second
        field is the reference character and the following fields are its
        look-alikes. Entries whose reference is longer than one character
        are ignored, as are lines with fewer than two fields.

        :return: the mapping ``{character: [look-alikes]}``, which is also
            stored as ``self.unicode_dict``
        :rtype: dict
        """
        self.unicode_dict = {}
        # The context manager closes the file even if reading fails.
        with open(self.confusables_file, encoding="utf-8") as file:
            for _ in range(32):
                file.readline()
            group_lines = [
                line.rstrip("\n") for line in file if line.startswith("#")
            ]
        # not taking the last line (total)
        for line in group_lines[:-1]:
            fields = line.split("\t")
            if len(fields) < 2:
                # A "#" line without a tab-separated reference character
                # carries no confusable group.
                continue
            char, look_alikes = fields[1], fields[2:]
            if len(char) > 1:
                continue
            self.unicode_dict[char] = look_alikes
        return self.unicode_dict

    def _substitution_table(self, level: int):
        """Validate *level* and return its look-alike table.

        :return: ``None`` for level 1 (no substitution), ``self.dict_latin``
            for levels 2 and 3, ``self.unicode_dict`` for level 4
        :raises ValueError: if *level* is not one of 1, 2, 3, 4
        """
        if level not in range(1, 5):
            raise ValueError(f"level {level} not implemented")
        if level == 1:
            return None
        if level == 4:
            return self.unicode_dict
        return self.dict_latin

    def _max_variants(self, text: str, level: int, cap: int) -> int:
        """Return an upper bound on the distinct scrambles of *text*.

        The bound is the product of the number of distinct choices at each
        position. The computation stops as soon as it reaches *cap*, so the
        result is only exact when it is smaller than *cap*.
        """
        table = self._substitution_table(level)
        # Level 2 inserts nothing between characters.
        per_gap = 1 if level == 2 else len(set(self.invisible_chars))
        total = 1
        last_index = len(text) - 1
        for index, char in enumerate(text):
            if table is not None:
                total *= len(set(table.get(char, []) + [char]))
            # No invisible character is kept after the final character.
            if index != last_index:
                total *= per_gap
            if total >= cap:
                break
        return total

    def scramble(self, text: str, level: int = 1) -> str:
        """Return the text scrambled.

        :param text: the text to scramble
        :type text: str
        :param level: defaults to 1
        :type level: int, optional

        **level**:

        1: insert non-printable characters within the text

        2: replace some latin letters by their Greek or Cyrillic equivalent

        3: insert non-printable characters and replace some latin letters
        by their Greek or Cyrillic equivalent

        4: insert non-printable characters and replace every character by a
        random pick among itself and its Unicode look-alikes

        :return: the scrambled string
        :rtype: str
        :raises ValueError: if *level* is not one of 1, 2, 3, 4
        """
        table = self._substitution_table(level)
        insert_invisible = level != 2
        pieces = []
        for char in text:
            if table is None:
                pieces.append(char)
            else:
                # The original character is always one of the candidates.
                pieces.append(choice(table.get(char, []) + [char]))
            if insert_invisible:
                pieces.append(choice(self.invisible_chars))
        if insert_invisible and pieces:
            # Invisible characters only go between characters: drop the one
            # that was added after the last character.
            pieces.pop()
        return "".join(pieces)

    def generate(self, text: str, n: int = 1000, level: int = 3) -> List[str]:
        """Return a list containing n distinct scrambled versions of the text.

        :param text: the text to be scrambled
        :type text: str
        :param n: the number of versions to produce, defaults to 1000
        :type n: int, optional
        :param level: the level of the scrambling, defaults to 3
        :type level: int, optional
        :return: a list of scrambled texts, all different; empty when
            ``n <= 0``
        :rtype: List[str]
        :raises ValueError: if *level* is not one of 1, 2, 3, 4, or if fewer
            than *n* distinct versions of the text can exist at that level

        .. code:: python

            >>> from text_scrambler import Scrambler
            >>> scr = Scrambler()
            >>> text = "A cranial nerve nucleus is a collection of neurons in the brain stem that is associated with one or more of the cranial nerves."
            >>> texts = scr.generate(text, 1000, level=1)
            >>> assert texts[0] != text
            >>> for scrambled_text in texts:
            ...     assert text != scrambled_text
            ...
            >>> print(texts[0])
            A cranial nerve nucleus is a collection of neurons in the brain stem that is associated with one or more of the cranial nerves.
            >>> # different from the original text
        """
        self._substitution_table(level)  # reject an invalid level up front
        if n <= 0:
            return []
        limit = self._max_variants(text, level, n)
        if limit < n:
            # Without this check the loop below could never terminate.
            raise ValueError(
                f"cannot generate {n} distinct versions at level {level}: "
                f"at most {limit} exist for this text"
            )
        seen = set()  # constant-time duplicate detection
        versions = []  # keeps the order of generation
        while len(versions) < n:
            candidate = self.scramble(text, level=level)
            if candidate not in seen:
                seen.add(candidate)
                versions.append(candidate)
        return versions