This is the third article in my series aiming at automatic sentence generation. This time we will write the generation logic as functions. The code is fairly long, so let's go through it in order.
Now let's talk about the code. Here is the one to use first.
import re
from janome.tokenizer import Tokenizer
from tqdm import tqdm
from collections import Counter
from collections import defaultdict
import random
t = Tokenizer()  # Shared Janome tokenizer; constructing it is expensive, so build it once
First, prepare the text file and load it, then clean up its contents. This part is the same as in the previous article.
# Load the source text. Using 'with' guarantees the file handle is closed
# even if reading fails (the original left the file open).
with open('test.txt', 'r', encoding="utf-8") as a:
    original_text = a.read()
#print(original_text)  # View document

# Markers delimiting the portion of the document we want to keep.
first_sentence = '"Description of Python."'
last_sentence = 'The reptile python, which means the English word Python, is used as the mascot and icon in the Python language.'

# Trim everything before first_sentence and after last_sentence,
# then re-attach the two boundary sentences themselves.
_, text = original_text.split(first_sentence)
text, _ = text.split(last_sentence)
text = first_sentence + text + last_sentence

# Normalize punctuation so '。' is the only sentence terminator.
# NOTE: '!' and '?' here are full-width characters; half-width ones are untouched.
text = text.replace('!', '。')
text = text.replace('?', '。')
text = text.replace('(', '').replace(')', '')  # Drop full-width parentheses
text = text.replace('\r', '').replace('\n', '')  # Remove line breaks
text = re.sub('[、「」?]', '', text)  # Strip remaining full-width punctuation

sentences = text.split('。')  # One list entry per sentence
print('word count:', len(sentences))
sentences[:10]  # Preview 10 sentences (output only visible in a notebook/REPL)
Break down sentence by sentence.
start = '__start__'  # Marks the start of a sentence
fin = '__fin__'  # Marks the end of a sentence

def get_three_words_list(sentence):
    """Return the sentence as a list of overlapping 3-word tuples (trigrams).

    The token stream is wrapped with the start/fin markers so the first and
    last words of the sentence also appear in complete trigrams.
    """
    # Reuse the module-level tokenizer: the original built a brand-new
    # Tokenizer() for every sentence, which is very slow.
    words = t.tokenize(sentence, wakati=True)
    words = [start] + list(words) + [fin]
    # A trigram starts at every index up to len(words) - 3 inclusive.
    return [tuple(words[i:i + 3]) for i in range(len(words) - 2)]
# Tokenize every sentence into trigrams and tally how often each occurs.
three_words_list = [
    trigram
    for sentence in tqdm(sentences)
    for trigram in get_three_words_list(sentence)
]
three_words_count = Counter(three_words_list)
len(three_words_count)  # Number of distinct trigrams (shown only in a notebook)
Connect and weight words
#Markov chain
def generate_markov_dict(three_words_count):
    """Build a Markov-chain transition table from trigram counts.

    Args:
        three_words_count: mapping of (w1, w2, w3) tuples to occurrence counts.

    Returns:
        dict mapping each (w1, w2) prefix to {'words': [...], 'weights': [...]},
        where words[i] is a possible next word and weights[i] its count
        (suitable for random.choices).
    """
    markov_dict = {}
    for three_words, count in three_words_count.items():
        two_words = three_words[:2]  # Leading two-word state
        next_word = three_words[2]   # Word that can follow that state
        # setdefault creates the empty entry the first time a state is seen,
        # replacing the original membership-test-then-insert pattern.
        entry = markov_dict.setdefault(two_words, {'words': [], 'weights': []})
        entry['words'].append(next_word)
        entry['weights'].append(count)
    return markov_dict
markov_dict = generate_markov_dict(three_words_count)
markov_dict  # Display the transition table (output only visible in a notebook/REPL)
def get_first_words_weights(three_words_count):
    """Collect candidate opening words and their weights (occurrence counts).

    A trigram whose first element is the start marker tells us that its
    second element can open a sentence; the trigram count becomes the weight.
    Returns two parallel lists (words, weights) for use with random.choices.
    """
    first_word_count = defaultdict(int)
    for trigram, count in three_words_count.items():
        if trigram[0] == start:
            first_word_count[trigram[1]] += count
    # Split the mapping into two parallel lists, preserving insertion order.
    words = list(first_word_count)
    weights = list(first_word_count.values())
    return words, weights
get_first_words_weights(three_words_count)  # Preview (output only visible in a notebook)
markov_dict = generate_markov_dict(three_words_count)  # Rebuild — same call as above
print(len(markov_dict))
first_words, first_weights = get_first_words_weights(three_words_count)
print(len(first_words))
def get_first_words_weights(three_words_count):
    # NOTE(review): this is an exact duplicate of the definition above;
    # it redefines the same function with identical behavior.
    first_word_count = defaultdict(int)
    for three_words, count in three_words_count.items():
        if three_words[0] == start:
            next_word = three_words[1]
            first_word_count[next_word] += count
    words = []  # Parallel lists: candidate opening words and their weights (counts)
    weights = []
    for word, count in first_word_count.items():
        words.append(word)  # Add each word and its weight to the lists
        weights.append(count)
    return words, weights
get_first_words_weights(three_words_count)
def get_first_words_weights(three_words_count):
    # NOTE(review): this third redefinition shadows the earlier version and
    # CHANGES the return value — it returns the raw word->count mapping
    # instead of a (words, weights) pair. The generation step still unpacks
    # two values, so it relies on first_words/first_weights having been
    # captured BEFORE this redefinition runs.
    first_word_count = defaultdict(int)  # defaultdict with int values (missing keys start at 0)
    for three_words, count in three_words_count.items():
        if three_words[0] == start:  # Keep only trigrams that begin with the start marker
            next_word = three_words[1]
            first_word_count[next_word] += count  # Accumulate the occurrence count
    return first_word_count
get_first_words_weights(three_words_count)
def generate_text(fwords, fweights, markov_dict):
    """Generate one sentence via a weighted random walk over the Markov table.

    fwords/fweights are the candidate opening words and their weights;
    markov_dict maps a (w1, w2) state to its possible next words and weights.
    The walk ends when the end-of-sentence marker is drawn.
    """
    opening = random.choices(fwords, weights=fweights)[0]
    chain = [start, opening]  # Accumulated words, beginning with the start marker
    while True:
        state = markov_dict[tuple(chain[-2:])]  # Transition entry for the last two words
        next_word = random.choices(state['words'], weights=state['weights'])[0]
        if next_word == fin:  # End-of-sentence marker drawn: stop
            break
        chain.append(next_word)
    return ''.join(chain[1:])  # Join words, dropping the start marker
Start generation!
# Print three randomly generated sentences.
for _ in range(3):
    print(generate_text(first_words, first_weights, markov_dict))
The result is this.
(゚Д゚) Sentences from the original text come out almost verbatim... most likely because the amount of source text is simply too small. Note: on some runs nothing was generated properly — perhaps my PC's specs were insufficient, but the exact cause is unknown. A large amount of text also takes a long time to process.
The text is now generated automatically, so this concludes "Aiming for automatic sentence generation". There is still room for improvement: the amount of source text and the feature set remain insufficient, so many outputs inevitably end up reproducing the original sentences verbatim. It is also a personal pity that the sample text used here is not very interesting. If I manage to improve things a little, I will write another article.
This code is based on a book and its sample code (unfortunately I forgot the book's title). I will also post text generated from Osamu Dazai's novel "No Longer Human".