/**
 * LdaWrapper: Convenient entry class (facade) for the Latent Dirichlet Allocation.
 *
 * useful VM parameters:
 * -ea : enable assertions (you never know...)
 * -Xms256m -Xmx512m: increase use of RAM. 32-bit Windows can reach maximum Xmx values of 2 GB.
 * -Dtorel.burnin=100: configure burn-in period (iterations to throw away before averaging)
 * -Dtorel.samplelag=20: configure sample-lag (number of iterations between two samples used in averaging)
 *
 *
* // create the documents
TreeMap pubid2doc = new TreeMap();
// the number refers to a unique identifier
pubid2doc.put(1, new Document(1, "titel 1","bla bla bla"));
pubid2doc.put(200, new Document(200, "titel 200","blubb blubb blubb"));
pubid2doc.put(3, new Document(3, "titel 3","blubb bla bla ignored"));
// Note that words that only occur in a single document (such as
// "ignored") are removed.
// configure LDA
// 2 topics
// 200 iterations
// to set the burn-in period to 20, run with -Dtorel.burnin=20.
// default is 100
// to set the sample lag to 5, run with -Dtorel.samplelag=5.
// default is 20
LdaWrapper lda = new LdaWrapper(pubid2doc, 2, 200, 1);
// the calculation is done in here...
lda.doLda();
// once this is finished, we can read the LDA results.
// the topic mixture for publication with id 1
Distribution thetaForDoc1 = lda.getThetaByPubId(1);
System.out.println("thetaForDoc1 = " + thetaForDoc1);
// the topic mixture for publication with id 200
Distribution thetaForDoc200 = lda.getThetaByPubId(200);
System.out.println("thetaForDoc200 = " + thetaForDoc200);
// the topic mixture for publication with id 3
Distribution thetaForDoc3 = lda.getThetaByPubId(3);
System.out.println("thetaForDoc3 = " + thetaForDoc3);
// the topic-word distribution for topic 0...
// .. a) with words referenced to unique integers
Distribution phiForTopic0 = lda.getPhis().get(0);
System.out.println("phiForTopic0 = " + phiForTopic0);
// .. b) with a distribution over the words
Distribution phiWordsForTopic0 =
lda.getVocdoc().phiToWords(phiForTopic0);
System.out.println("phiWordsForTopic0 = " + phiWordsForTopic0);
// same for topic 1.
Distribution phiForTopic1 = lda.getPhis().get(1);
System.out.println("phiForTopic1 = " + phiForTopic1);
Distribution phiWordsForTopic1 =
lda.getVocdoc().phiToWords(phiForTopic1);
System.out.println("phiWordsForTopic1 = " + phiWordsForTopic1);
*
* During the inference, an estimate of when the final iteration is
reached is reported every 100 iterations.
*
* In addition, the process can be stopped by creating a file with a special
name (xxx.kill) in the working directory.
* If this is the case, the doGibbs-call will return as early as
possible. The availability of the kill file is checked every 100 iterations.
*
*/