/** * LdaWrapper: Convenient entry class (facade) for the Latent Dirichlet Allocation. * * useful VM parameters: * -ea : enable assertions (you never know...) * -Xms256m -Xmx512m: increase use of RAM. 32 bit Windows can reach maximum Xmx values of 2 GB. * -Dtorel.burnin=100: configure burn-in period (iterations to throw away before averaging) * -Dtorel.samplelag=20: configure sample-lag (number of iterations between two samples used in averaging) * *

Usage

*
 *      // create the documents
        TreeMap pubid2doc = new TreeMap();
        // the number refers to a unique identifier
        pubid2doc.put(1, new Document(1, "titel 1","bla bla bla"));
        pubid2doc.put(200, new Document(200, "titel 200","blubb blubb blubb"));
        pubid2doc.put(3, new Document(3, "titel 3","blubb bla bla ignored"));
        // Note that words that only occur in a single document (such as "ignored") are removed.

        // configure LDA
        // 2 topics
        // 200 iterations
        // to set the burn-in period to 20, run with -Dtorel.burnin=20. The default is 100.
        // to set the sample lag to 5, run with -Dtorel.samplelag=5. The default is 20.
        LdaWrapper lda = new LdaWrapper(pubid2doc, 2, 200, 1);
        // the calculation is done in here...
        lda.doLda();
        // once this is finished, we can read the LDA results.

        // the topic mixture for publication with id 1
        Distribution thetaForDoc1 = lda.getThetaByPubId(1);
        System.out.println("thetaForDoc1 = " + thetaForDoc1);
        // the topic mixture for publication with id 200
        Distribution thetaForDoc200 = lda.getThetaByPubId(200);
        System.out.println("thetaForDoc200 = " + thetaForDoc200);
        // the topic mixture for publication with id 3
        Distribution thetaForDoc3 = lda.getThetaByPubId(3);
        System.out.println("thetaForDoc3 = " + thetaForDoc3);

        // the topic-word distribution for topic 0...
        // .. a) with words referenced to unique integers
        Distribution phiForTopic0 = lda.getPhis().get(0);
        System.out.println("phiForTopic0 = " + phiForTopic0);
        // .. b) with a distribution over the words
        Distribution phiWordsForTopic0 = lda.getVocdoc().phiToWords(phiForTopic0);
        System.out.println("phiWordsForTopic0 = " + phiWordsForTopic0);

        // same for topic 1.
        Distribution phiForTopic1 = lda.getPhis().get(1);
        System.out.println("phiForTopic1 = " + phiForTopic1);
        Distribution phiWordsForTopic1 = lda.getVocdoc().phiToWords(phiForTopic1);
        System.out.println("phiWordsForTopic1 = " + phiWordsForTopic1);
  
* * During the inference, an estimate of when the final iteration is reached is reported every 100 iterations. * * In addition, the process can be stopped by creating a special file (xxx.kill) in the working directory. * If this is the case, the doGibbs-call will return as early as possible. The availability of the kill file is checked every 100 iterations. * */