Module util_helper.text_preprocessor
Expand source code
import os
import re
def split_by_letters(text, max_letters=1):
    """Split ``text`` into consecutive chunks of at most ``max_letters`` characters.

    Args:
        text: The string to split.
        max_letters: Maximum number of characters per chunk; must be >= 1.

    Returns:
        A list of substrings in original order. The last chunk may be shorter
        than ``max_letters``. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``max_letters`` is smaller than 1.
    """
    # A non-positive step would either make range() fail with an unrelated
    # message (0) or silently produce no chunks at all (negative).
    if max_letters < 1:
        raise ValueError("max_letters must be a positive integer")
    return [text[start:start + max_letters] for start in range(0, len(text), max_letters)]
def split_by_words(text, max_words=1):
    """Split ``text`` into consecutive groups of at most ``max_words`` words.

    Words are obtained by splitting on a single space character, so runs of
    several spaces produce empty "words" and other whitespace (tabs, newlines)
    is not treated as a separator.

    Args:
        text: The string to split.
        max_words: Maximum number of words per group; must be >= 1.

    Returns:
        A list of strings, each made of up to ``max_words`` words re-joined
        with single spaces. An empty ``text`` yields ``[""]``.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    # A non-positive step would either make range() fail with an unrelated
    # message (0) or silently produce no groups at all (negative).
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")
    words = text.split(" ")
    return [" ".join(words[start:start + max_words]) for start in range(0, len(words), max_words)]
def split_by_lines(text):
    """Return the pieces of ``text`` separated by newline characters.

    Only ``"\\n"`` is used as the separator; a trailing newline therefore
    produces a final empty string in the result.
    """
    return text.split("\n")
def split_by_double_lines(text):
    """Return the pieces of ``text`` separated by blank lines.

    The separator is exactly two consecutive newline characters; single
    newlines inside a piece are left untouched.
    """
    return text.split("\n\n")
def split_by_dot(text):
    """Return the pieces of ``text`` separated by ``"."`` characters.

    Pieces are returned as-is: surrounding whitespace is kept and a trailing
    dot produces a final empty string.
    """
    return text.split(".")
def remove_non_letters(string):
    """Strip everything except ASCII letters and normalise whitespace.

    Every character that is neither ``a-z``/``A-Z`` nor whitespace is removed,
    each run of whitespace is collapsed to a single space, and leading and
    trailing whitespace is trimmed.

    Args:
        string: The text to clean.

    Returns:
        The cleaned text.
    """
    # No debug print here: this is a pure helper and must not write its
    # input to stdout.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', string)
    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed.strip()
def remove_list_formatting(string):
    """Remove numbered-list markers such as ``1. `` or ``2) `` from text.

    A marker is a run of digits followed by ``.`` or ``)`` and a space, found
    either at the very start of the string or right after a whitespace
    character. Each marker is replaced by a single space so the text on both
    sides stays separated; runs of whitespace are then collapsed to one space
    and the result is trimmed.

    Args:
        string: The text to clean.

    Returns:
        The text without list markers, with normalised spacing.
    """
    # Substitute a space rather than nothing: the marker match includes the
    # whitespace in front of it, so deleting it outright would glue the
    # neighbouring words together ("a\n1. b" -> "ab").
    without_markers = re.sub(r'(?:^|\s)[0-9]+[.)] ', ' ', string)
    collapsed = re.sub(r'\s{2,}', ' ', without_markers)
    return collapsed.strip()
Functions
def remove_list_formatting(string)
-
Expand source code
def remove_list_formatting(string):
    """Remove numbered-list markers such as ``1. `` or ``2) `` from text.

    A marker is a run of digits followed by ``.`` or ``)`` and a space, found
    either at the very start of the string or right after a whitespace
    character. Each marker is replaced by a single space so the text on both
    sides stays separated; runs of whitespace are then collapsed to one space
    and the result is trimmed.

    Args:
        string: The text to clean.

    Returns:
        The text without list markers, with normalised spacing.
    """
    # Substitute a space rather than nothing: the marker match includes the
    # whitespace in front of it, so deleting it outright would glue the
    # neighbouring words together ("a\n1. b" -> "ab").
    without_markers = re.sub(r'(?:^|\s)[0-9]+[.)] ', ' ', string)
    collapsed = re.sub(r'\s{2,}', ' ', without_markers)
    return collapsed.strip()
def remove_non_letters(string)
-
Expand source code
def remove_non_letters(string):
    """Strip everything except ASCII letters and normalise whitespace.

    Every character that is neither ``a-z``/``A-Z`` nor whitespace is removed,
    each run of whitespace is collapsed to a single space, and leading and
    trailing whitespace is trimmed.

    Args:
        string: The text to clean.

    Returns:
        The cleaned text.
    """
    # No debug print here: this is a pure helper and must not write its
    # input to stdout.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', string)
    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed.strip()
def split_by_dot(text)
-
Expand source code
def split_by_dot(text):
    """Return the pieces of ``text`` separated by ``"."`` characters.

    Pieces are returned as-is: surrounding whitespace is kept and a trailing
    dot produces a final empty string.
    """
    return text.split(".")
def split_by_double_lines(text)
-
Expand source code
def split_by_double_lines(text):
    """Return the pieces of ``text`` separated by blank lines.

    The separator is exactly two consecutive newline characters; single
    newlines inside a piece are left untouched.
    """
    return text.split("\n\n")
def split_by_letters(text, max_letters=1)
-
Expand source code
def split_by_letters(text, max_letters=1):
    """Split ``text`` into consecutive chunks of at most ``max_letters`` characters.

    Args:
        text: The string to split.
        max_letters: Maximum number of characters per chunk; must be >= 1.

    Returns:
        A list of substrings in original order. The last chunk may be shorter
        than ``max_letters``. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``max_letters`` is smaller than 1.
    """
    # A non-positive step would either make range() fail with an unrelated
    # message (0) or silently produce no chunks at all (negative).
    if max_letters < 1:
        raise ValueError("max_letters must be a positive integer")
    return [text[start:start + max_letters] for start in range(0, len(text), max_letters)]
def split_by_lines(text)
-
Expand source code
def split_by_lines(text):
    """Return the pieces of ``text`` separated by newline characters.

    Only ``"\\n"`` is used as the separator; a trailing newline therefore
    produces a final empty string in the result.
    """
    return text.split("\n")
def split_by_words(text, max_words=1)
-
Expand source code
def split_by_words(text, max_words=1):
    """Split ``text`` into consecutive groups of at most ``max_words`` words.

    Words are obtained by splitting on a single space character, so runs of
    several spaces produce empty "words" and other whitespace (tabs, newlines)
    is not treated as a separator.

    Args:
        text: The string to split.
        max_words: Maximum number of words per group; must be >= 1.

    Returns:
        A list of strings, each made of up to ``max_words`` words re-joined
        with single spaces. An empty ``text`` yields ``[""]``.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    # A non-positive step would either make range() fail with an unrelated
    # message (0) or silently produce no groups at all (negative).
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")
    words = text.split(" ")
    return [" ".join(words[start:start + max_words]) for start in range(0, len(words), max_words)]