List of links:
http://nlp.stanford.edu/software/segmenter.shtml#Download
http://nlp.stanford.edu/downloads/tagger.shtml
Useful packages:
Two macros:
StanfordSegmenter
You need to copy the model files:
Model download link (250 MB): http://nlp.stanford.edu/software/stanford-segmenter-2014-01-04.zip
Macro parameters:
inputDir: input directory
outputDir: output directory
model: '.ser.gz' segmenter model file (e.g. data/arabic-segmenter-atbtrain.ser.gz)
extension: regular expression matching the extension of the files to process
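
For orientation, the core of the segmenter macro below boils down to this minimal standalone sketch (a sketch only: in.txt and out.txt are placeholder file names, and the model path assumes the data/ directory from the zip above is reachable from the working directory):

import edu.stanford.nlp.international.arabic.process.ArabicSegmenter

// load the CRF segmenter model shipped in the zip above
Properties options = new Properties()
options.put("loadClassifier", "data/arabic-segmenter-atbtrain.ser.gz")
ArabicSegmenter segmenter = new ArabicSegmenter(options)
segmenter.flags.inputEncoding = "UTF-8"
segmenter.loadSegmenter(segmenter.flags.loadClassifier, options)

// segment one UTF-8 text file, one line of segmented tokens per input line
new File("in.txt").withReader("UTF-8") { br ->
    new File("out.txt").withWriter("UTF-8") { w ->
        segmenter.segment(br, new PrintWriter(w, true))
    }
}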
StanfordTagger
You need to copy the model files:
Model download link (110 MB): http://nlp.stanford.edu/downloads/stanford-postagger-full-2014-01-04.zip
Macro parameters:
inputDir: input directory (by default the segmenter's output directory, out-s)
outputDir: output directory
model: '.tagger' model file (e.g. taggers/arabic-train.tagger)
extension: regular expression matching the extension of the files to process
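
Note that the default paths chain the two macros: the segmenter writes to out-s, which is the tagger's default input directory. The tagging step itself reduces to roughly the following sketch (the model path is the macro's default and out-s/test.txt is a placeholder; sentence splitting relies on DocumentPreprocessor, as in the macro below):

import edu.stanford.nlp.ling.*
import edu.stanford.nlp.process.*
import edu.stanford.nlp.tagger.maxent.MaxentTagger

MaxentTagger tagger = new MaxentTagger("taggers/arabic-train.tagger")
DocumentPreprocessor dp = new DocumentPreprocessor(new File("out-s/test.txt").newReader("UTF-8"))
for (List<HasWord> sentence : dp) {
    List<TaggedWord> tagged = tagger.tagSentence(sentence)
    println Sentence.listToString(tagged, false)   // prints "word_TAG word_TAG ..."
}

The full sources of the two macros follow.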
StanfordSegmenterMacro.groovy

// STANDARD DECLARATIONS
import org.kohsuke.args4j.*
import groovy.transform.Field
import org.txm.rcpapplication.swt.widget.parameters.*
import edu.stanford.nlp.international.arabic.process.*
import edu.stanford.nlp.international.arabic.*

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.WordSegmenter;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.concurrent.MulticoreWrapper;
import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor;

// Macro parameters, edited through the TXM parameters dialog
@Field @Option(name="inputDir", usage="input directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe")
File inputDir

@Field @Option(name="outputDir", usage="output directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe/out-s")
File outputDir

@Field @Option(name="model", usage="'.ser.gz' segmenter model file", widget="File", required=false, def="data/arabic-segmenter-atbtrain.ser.gz")
File model

@Field @Option(name="extension", usage="regular expression matching the extension of the files to process", widget="String", required=true, def='\\.txt')
String extension = "\\.txt"

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return;

outputDir.mkdir()
if (!outputDir.exists()) {
    println("Could not create $outputDir")
    return;
}
// END OF PARAMETERS

//File inputFile = new File("/home/mdecorde/xml/testarabe/test.txt")
inputDir.eachFileMatch(~/.*$extension/) { inputFile ->
    println "Processing file: $inputFile"
    File outputFile = new File(outputDir, inputFile.getName())
    //File model = new File("/home/mdecorde/LIBRAIRIES/stanford-segmenter-2014-01-04/data/arabic-segmenter-atbtrain.ser.gz")

    // load the CRF segmenter model
    Properties options = new Properties();
    //options.put("prefixMarker", "#")
    //options.put("suffixMarker", "#")
    options.put("loadClassifier", model.getAbsolutePath())

    ArabicSegmenter segmenter = new ArabicSegmenter(options);
    segmenter.flags.inputEncoding = "UTF-8";
    segmenter.loadSegmenter(segmenter.flags.loadClassifier, options);

    // segment the file, writing UTF-8 output under the same file name in outputDir
    def br = inputFile.newReader("UTF-8")
    OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8");
    PrintWriter pw = new PrintWriter(out, true);
    double charsPerSec = decode(segmenter, br, pw, 5);
    IOUtils.closeIgnoringExceptions(br);
    System.err.printf("Done! Processed input text at %.2f input characters/second%n", charsPerSec);
}

/**
 * Segment input and write to output stream.
 *
 * @param segmenter
 * @param br
 * @param pwOut
 * @param nThreads
 * @return input characters processed per second
 */
def decode(ArabicSegmenter segmenter, BufferedReader br, PrintWriter pwOut, int nThreads) {
    assert nThreads > 0;
    long nChars = 0;
    final long startTime = System.nanoTime();
    if (nThreads > 1) {
        // multi-threaded decoding: feed lines to a pool of segmenters, print results as they arrive
        MulticoreWrapper<String,String> wrapper = new MulticoreWrapper<String,String>(nThreads, segmenter);
        try {
            for (String line; (line = br.readLine()) != null;) {
                nChars += line.length();
                wrapper.put(line);
                while (wrapper.peek()) {
                    pwOut.println(wrapper.poll());
                }
            }
            wrapper.join();
            while (wrapper.peek()) {
                pwOut.println(wrapper.poll());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    } else {
        // single-threaded decoding
        nChars = segmenter.segment(br, pwOut);
    }
    long duration = System.nanoTime() - startTime;
    double charsPerSec = (double) nChars / (duration / 1000000000.0);
    return charsPerSec;
}

StanfordTaggerMacro.groovy

import edu.stanford.nlp.ling.*
import edu.stanford.nlp.process.*
import edu.stanford.nlp.tagger.maxent.*;

// STANDARD DECLARATIONS
import org.kohsuke.args4j.*
import groovy.transform.Field
import org.txm.rcpapplication.swt.widget.parameters.*

// Macro parameters, edited through the TXM parameters dialog
@Field @Option(name="inputDir", usage="input directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe/out-s")
File inputDir

@Field @Option(name="outputDir", usage="output directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe/out-t")
File outputDir

@Field @Option(name="model", usage="'.tagger' model file", widget="File", required=false, def="taggers/arabic-train.tagger")
File model

@Field @Option(name="extension", usage="regular expression matching the extension of the files to process", widget="String", required=true, def='\\.txt')
String extension = "\\.txt"

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return;

def modelPath = model.getAbsolutePath()

outputDir.mkdir()
if (!outputDir.exists()) {
    println("Could not create $outputDir")
    return;
}

inputDir.eachFileMatch(~/.*$extension/) { inputFile ->
    // build the output file name from the input file name, replacing the extension with ".xml"
    String name = inputFile.getName()
    int idx = name.indexOf(".")
    if (idx > 0) name = name.substring(0, idx)

    new File(outputDir, name+".xml").withWriter("UTF-8") { writer ->
        writer.println('<?xml version="1.0" encoding="UTF-8"?>')
        writer.println('<text>')

        MaxentTagger tagger = new MaxentTagger(modelPath);
        TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
        // read the segmented text as UTF-8 (the original relied on the platform default encoding)
        DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(inputFile.newReader("UTF-8"));
        documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);

        int n = 0
        for (List<HasWord> sentence : documentPreprocessor) {
            writer.println("<s n=\"$n\">")
            n++;
            List<TaggedWord> tSentence = tagger.tagSentence(sentence);
            //println(Sentence.listToString(tSentence, false));
            for (TaggedWord w : tSentence) {
                def form = w.word();
                def pos = w.tag();
                // note: forms and tags are written as-is, without XML escaping of '&', '<', '>'
                writer.println("<w pos=\"$pos\">$form</w>")
            }
            writer.println('</s>')
        }
        writer.println('</text>')
    }
}
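
For reference, each run of the tagger macro produces one XML file per input file with the following shape (the POS tags shown are illustrative only; actual tokens and tags depend on the input text and the model):

<?xml version="1.0" encoding="UTF-8"?>
<text>
<s n="0">
<w pos="DTNN">…</w>
<w pos="IN">…</w>
</s>
</text>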