Outils pour utilisateurs

Outils du site


Panneau latéral

public:stanford_segmenter_and_tagger

Intégration expérimentale du segmenteur et tagger de Stanford

Sites

http://nlp.stanford.edu/software/segmenter.shtml#Download

http://nlp.stanford.edu/downloads/tagger.shtml

packages utiles :

  • edu.stanford.nlp.international.arabic.pipeline
  • edu.stanford.nlp.international.arabic.process

Pour l'instant

2 Macros :

  • StanfordSegmenter : découpe en mots les textes arabes et chinois d'un dossier
  • StanfordTagger : étiquette les textes d'un dossier et les convertit au format XML/w

StanfordSegmenter

Il faut copier :

  • seg.jar dans $TXMHOME/scripts/lib
  • StanfordSegmenterMacro.groovy dans $TXMHOME/scripts/macro

Lien de download des modèles (250Mo) : http://nlp.stanford.edu/software/stanford-segmenter-2014-01-04.zip

Paramètres de la macro :

  • dossier d'entrée
  • dossier de sortie
  • fichier modèle (.ser.gz dans le dossier “data” de l'archive stanford-segmenter-2014-01-04.zip)
  • extension des fichiers à traiter

StanfordTagger

Il faut copier

  • stanford-postagger.jar dans $TXMHOME/scripts/lib
  • StanfordTaggerMacro.groovy dans $TXMHOME/scripts/macro

Lien de download des modèles (110Mo) : http://nlp.stanford.edu/downloads/stanford-postagger-full-2014-01-04.zip

Paramètres de la macro :

  • dossier d'entrée
  • dossier de sortie
  • fichier modèle (.tagger dans le dossier “models” de l'archive stanford-postagger-full-2014-01-04.zip)
  • extension des fichiers à traiter
StanfordSegmenterMacro.groovy

// STANDARD DECLARATIONS

import org.kohsuke.args4j.*
import groovy.transform.Field
import org.txm.rcpapplication.swt.widget.parameters.*
import edu.stanford.nlp.international.arabic.process.*
import edu.stanford.nlp.international.arabic.*

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.WordSegmenter;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.concurrent.MulticoreWrapper;
import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor;

@Field @Option(name="inputDir",usage="input directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe")
File inputDir
@Field @Option(name="outputDir",usage="output directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe/out-s")
File outputDir

// Fixed usage text: this macro takes a '.ser.gz' segmenter model, not a '.tagger' model
@Field @Option(name="model",usage="'.ser.gz' segmenter model file", widget="File", required=false, def="data/arabic-segmenter-atbtrain.ser.gz")
File model
@Field @Option(name="extension",usage="Regexp de l'extension des fichiers à modifier", widget="String", required=true, def='\\.txt')
String extension = "\\.txt"

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return;

// mkdirs() also creates missing parent directories (mkdir() fails when the parent is absent)
outputDir.mkdirs()
if (!outputDir.exists()) {
	println("Could not create $outputDir")
	return;
}

// END OF PARAMETERS

// Load the segmenter ONCE: model loading is expensive and identical for every file
// (the original re-loaded the model inside the per-file loop)
Properties options = new Properties();
//options.put("prefixMarker", "#")
//options.put("suffixMarker", "#")
options.put("loadClassifier", model.getAbsolutePath())
ArabicSegmenter segmenter = new ArabicSegmenter(options);
segmenter.flags.inputEncoding = "UTF-8";
segmenter.loadSegmenter(segmenter.flags.loadClassifier, options);

inputDir.eachFileMatch(~/.*$extension/) { inputFile ->
	println "Processing file: $inputFile"
	File outputFile = new File(outputDir, inputFile.getName())

	def br = inputFile.newReader("UTF-8")
	OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8");
	PrintWriter pw = new PrintWriter(out, true);
	try {
		double charsPerSec = decode(segmenter, br, pw, 5);
		System.err.printf("Done! Processed input text at %.2f input characters/second%n", charsPerSec);
	} finally {
		// Close BOTH streams: the original leaked the PrintWriter, risking unflushed output
		IOUtils.closeIgnoringExceptions(br);
		IOUtils.closeIgnoringExceptions(pw);
	}
}

/**
 * Segment text read from {@code br} and write the segmented output to {@code pwOut}.
 *
 * @param segmenter the loaded Arabic segmenter
 * @param br        input text, consumed line by line
 * @param pwOut     destination for segmented lines
 * @param nThreads  number of worker threads; must be strictly positive
 * @return input characters processed per second
 */
def decode(ArabicSegmenter segmenter, BufferedReader br,
                               PrintWriter pwOut, int nThreads) {
    assert nThreads > 0;
    long charCount = 0;
    final long startNs = System.nanoTime();
    if (nThreads == 1) {
      // Single-threaded path: let the segmenter stream the reader directly.
      charCount = segmenter.segment(br, pwOut);
    } else {
      // Multi-threaded path: feed lines to a worker pool and drain completed
      // results as they become available.
      MulticoreWrapper<String,String> pool = new MulticoreWrapper<String,String>(nThreads, segmenter);
      try {
        String line;
        while ((line = br.readLine()) != null) {
          charCount += line.length();
          pool.put(line);
          while (pool.peek()) {
            pwOut.println(pool.poll());
          }
        }
        // Wait for outstanding tasks, then flush whatever remains.
        pool.join();
        while (pool.peek()) {
          pwOut.println(pool.poll());
        }
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
    long elapsedNs = System.nanoTime() - startNs;
    return (double) charCount / (elapsedNs / 1000000000.0);
  }


StanfordTaggerMacro.groovy

import edu.stanford.nlp.ling.*
import edu.stanford.nlp.process.*
import edu.stanford.nlp.tagger.maxent.*;

// STANDARD DECLARATIONS

import org.kohsuke.args4j.*
import groovy.transform.Field
import org.txm.rcpapplication.swt.widget.parameters.*

@Field @Option(name="inputDir",usage="input directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe/out-s")
File inputDir
@Field @Option(name="outputDir",usage="output directory", widget="Folder", required=true, def="/home/mdecorde/xml/testarabe/out-t")
File outputDir

@Field @Option(name="model",usage="'.tagger' model file", widget="File", required=false, def="taggers/arabic-train.tagger")
File model
@Field @Option(name="extension",usage="Regexp de l'extension des fichiers à modifier", widget="String", required=true, def='\\.txt')
String extension = "\\.txt"

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return;
def modelPath = model.getAbsolutePath()

// mkdirs() also creates missing parent directories (mkdir() fails when the parent is absent)
outputDir.mkdirs()
if (!outputDir.exists()) {
	println("Could not create $outputDir")
	return;
}

// Load the tagger model ONCE: loading is expensive and identical for every file
// (the original re-loaded it inside the per-file loop)
MaxentTagger tagger = new MaxentTagger(modelPath);
TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
		"untokenizable=noneKeep");

// Minimal XML escaping: forms or tags containing & < > " would otherwise
// produce ill-formed XML output
def escapeXml = { String s ->
	s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace('"', "&quot;")
}

inputDir.eachFileMatch(~/.*$extension/) { inputFile ->
	// Output file name = input base name (up to the first dot) + ".xml"
	String name = inputFile.getName()
	int idx = name.indexOf(".")
	if (idx > 0) name = name.substring(0, idx)

	new File(outputDir, name+".xml").withWriter("UTF-8") { writer ->

		writer.println('<?xml version="1.0" encoding="UTF-8"?>')
		writer.println('<text>')

		// Read as UTF-8: the segmenter step writes UTF-8, and the original
		// newReader() used the platform default charset (corrupts Arabic text)
		DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(inputFile.newReader("UTF-8"));
		documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
		int n = 0

		// One <s> element per sentence, one <w> element per tagged token
		for (List<HasWord> sentence : documentPreprocessor) {
			writer.println("<s n=\"$n\">")
			n++;
			List<TaggedWord> tSentence = tagger.tagSentence(sentence);
			//println(Sentence.listToString(tSentence, false));
			for (TaggedWord w : tSentence) {
				def form = escapeXml(w.word())
				def pos = escapeXml(w.tag())
				writer.println("<w pos=\"$pos\">$form</w>")
			}
			writer.println('</s>')
		}
		writer.println('</text>')
	}
}
public/stanford_segmenter_and_tagger.txt · Dernière modification: 2014/03/13 19:00 par slh@ens-lyon.fr