I'm going to use GloVe with Deeplearning4j, a Java library.
Prepare the corpus you want to study in advance. For a Japanese corpus, segment the text into space-separated words (wakati-gaki). When segmenting, it may help to normalize verbs to their dictionary (base) form.
Save the corpus text file as **input.txt**. The trained model will be saved as **model.txt**.
ModelBuild.java
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.glove.Glove;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import java.io.*;
public class ModelBuild {
    /**
     * Trains a GloVe model on the corpus in {@code input.txt} and writes the
     * resulting word vectors to {@code model.txt} in text format.
     */
    public static void main( String[] args ) throws Exception{
        // Load the corpus: one sentence per line, iterated lazily.
        System.out.println("Reading data...");
        File corpusFile = new File("input.txt");
        SentenceIterator sentences = new BasicLineIterator(corpusFile);

        // Tokenizer that splits each line into words; CommonPreprocessor
        // lower-cases tokens and strips punctuation.
        System.out.println("Create a tokenizer...");
        TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
        tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

        // Configure the GloVe model.
        System.out.println("Creating a model...");
        Glove gloveModel = new Glove.Builder()
                .iterate(sentences)        // sentence source
                .tokenizerFactory(tokenizerFactory) // word segmentation
                .alpha(0.75)               // exponent of the weighting function
                .learningRate(0.1)         // initial learning rate
                .epochs(25)                // passes over the training corpus
                .layerSize(300)            // vector dimensionality
                .maxMemory(2)              // memory cap (GB) for co-occurrence table
                .xMax(100)                 // weighting-function cutoff
                .batchSize(1000)           // words per mini-batch
                .windowSize(10)            // context window size
                .shuffle(true)
                .symmetric(true)
                .build();

        // Fit the model on the corpus.
        System.out.println("I'm learning...");
        gloveModel.fit();

        // Persist the learned vectors as a plain-text file.
        System.out.println("Saving the model...");
        WordVectorSerializer.writeWordVectors(gloveModel, "model.txt");
        System.out.println("The program is over");
    }
}
Evaluation.java
import java.io.File;
import java.io.FileNotFoundException;
import java.io.UnsupportedEncodingException;
import java.util.Collection;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.embeddings.wordvectors.WordVectors;
public class Evaluation {
    /**
     * Loads a GloVe model saved in text format and runs two demo queries:
     * the 10 words nearest to a target word, and the cosine similarity
     * between a pair of words.
     *
     * @param args optional; {@code args[0]} is the model file path.
     *             Defaults to {@code "model.txt"} (the name ModelBuild
     *             writes) when no argument is supplied.
     */
    public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
        // Bug fix: the original read args[0] unconditionally and threw
        // ArrayIndexOutOfBoundsException when launched without arguments.
        String modelPath = (args.length > 0) ? args[0] : "model.txt";

        // Load the model file produced by ModelBuild.
        System.out.println("Loading model file...");
        File inputFile = new File(modelPath);
        WordVectors vec = WordVectorSerializer.loadTxtVectors(inputFile);

        // Display the top 10 words most similar to the target word.
        System.out.println("Top 10 similar words...");
        String word = "weather";
        int ranking = 10;
        Collection<String> similarTop10 = vec.wordsNearest( word , ranking );
        System.out.println( String.format( "Similar word to 「%s」 is %s" , word , similarTop10 ) );

        // Show the cosine similarity between two words.
        System.out.println( "Show cosine similarity..." );
        String word1 = "Sunny";
        String word2 = "rain";
        double similarity = vec.similarity( word1 , word2 );
        System.out.println( String.format( "The similarity between 「%s」 and 「%s」 is %f" , word1 , word2 , similarity ) );
    }
}
Recommended Posts