Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion README.textile
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,24 @@ See individual readme files under the scripts directory for how to run.

h2. Why is it called Varaha?

Evidently, Varaha is an avatar of the Hindu god Vishnu, in the form of a Boar.
Evidently, Varaha is an avatar of the Hindu god Vishnu, in the form of a Boar.

h2. How do I tokenize and tag text?

register ../../lib/stanford-postagger-withModel.jar
register ../../target/varaha-1.0-SNAPSHOT.jar

reviews = LOAD 'data/ten.avro' USING AvroStorage();
foo = FOREACH reviews GENERATE business_id, varaha.text.StanfordTokenize(text) AS tokenized;
DUMP foo;

reviews = LOAD 'data/ten.avro' USING AvroStorage();
reviews = LIMIT reviews 1000;
bar = FOREACH reviews GENERATE business_id, FLATTEN(varaha.text.SentenceTokenize(text)) AS tokenized_sentences;
bar = FOREACH bar GENERATE business_id, varaha.text.StanfordPOSTag(tokenized_sentences) AS tagged;
DUMP bar;

reviews = LOAD 'data/ten.avro' USING AvroStorage();
reviews = LIMIT reviews 1000;
bar = FOREACH reviews GENERATE business_id, varaha.text.StanfordPOSTag(varaha.text.StanfordTokenize(text)) AS tokens;
DUMP bar;
Binary file added lib/stanford-corenlp-1.3.4.jar
Binary file not shown.
Binary file added lib/stanford-postagger-withModel.jar
Binary file not shown.
Binary file added scripts/pos_tagging/data/ten.avro/._SUCCESS.crc
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
17 changes: 17 additions & 0 deletions scripts/pos_tagging/test.pig
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
-- Jars needed by the varaha.text UDFs called below.
register ../../lib/stanford-postagger-withModel.jar;
register ../../target/varaha-1.0-SNAPSHOT.jar;

-- 1) Tokenize the raw review text.
reviews = LOAD 'data/ten.avro' USING AvroStorage();
foo = FOREACH reviews GENERATE business_id, varaha.text.StanfordTokenize(text) AS tokenized;
DUMP foo;

-- 2) Split each review into sentences (one row per sentence after FLATTEN),
--    then POS-tag each tokenized sentence.
reviews = LOAD 'data/ten.avro' USING AvroStorage();
reviews = LIMIT reviews 1000;
bar = FOREACH reviews GENERATE business_id, FLATTEN(varaha.text.SentenceTokenize(text)) AS tokenized_sentences;
bar = FOREACH bar GENERATE business_id, varaha.text.StanfordPOSTag(tokenized_sentences) AS tagged;
DUMP bar;

-- 3) Tokenize the whole review and POS-tag the resulting bag of tokens in one pass.
reviews = LOAD 'data/ten.avro' USING AvroStorage();
reviews = LIMIT reviews 1000;
bar = FOREACH reviews GENERATE business_id, varaha.text.StanfordPOSTag(varaha.text.StanfordTokenize(text)) AS tokens;
DUMP bar;
11 changes: 11 additions & 0 deletions src/main/java/varaha/text/EntityRecognize.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package varaha.text;

/**
 * Placeholder class in the {@code varaha.text} package; it currently declares
 * no fields, constructors or methods.
 *
 * NOTE(review): presumably intended to become an entity-recognition UDF
 * alongside the other classes in this package -- confirm before relying on it.
 *
 * Created 12/24/13.
 *
 * @author rjurney
 */
public class EntityRecognize {
}
80 changes: 80 additions & 0 deletions src/main/java/varaha/text/NGramTokenize.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package varaha.text;

import java.io.IOException;
import java.io.StringReader;
import java.util.*;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.BagFactory;

import edu.stanford.nlp.process.DocumentPreprocessor;

/**
* TokenizeText uses the Lucene libraries StandardAnalyzer class to tokenize a
* raw text input. A list of the stopwords used is available {@link StopWords}.
* Output is a pig bag containing tokens.
*
* <dt><b>Example:</b></dt>
* <dd><code>
* register varaha.jar;<br/>
* documents = LOAD 'documents' AS (doc_id:chararray, text:chararray);<br/>
* tokenized = FOREACH documents GENERATE doc_id AS doc_id, FLATTEN(StanfordTokenize(text)) AS (token:chararray);
* </code></dd>
* </dl>
*
* @see
* @author Russell Jurney
*
*/
public class NGramTokenizer extends EvalFunc<DataBag> {

private static TupleFactory tupleFactory = TupleFactory.getInstance();
private static BagFactory bagFactory = BagFactory.getInstance();

public DataBag exec(Tuple input) throws IOException {
if (input == null || input.size() < 1 || input.isNull(0))
return null;

StringReader textInput = new StringReader(input.get(0).toString());
int n = (Integer)input.get(1);

// Output bag
DataBag bagOfSentences = bagFactory.newDefaultBag();

DocumentPreprocessor dp = new DocumentPreprocessor(textInput);
for (List sentence : dp) {
DataBag sentenceBag = bagFactory.newDefaultBag();
ListIterator<Object> sli = sentence.listIterator();
while(sli.hasNext())
{
String word = sli.next().toString();
Tuple termText = tupleFactory.newTuple(word);
sentenceBag.add(termText);
}
Tuple sentenceTuple = tupleFactory.newTuple(sentenceBag);
bagOfSentences.add(sentenceTuple);
}
return bagOfSentences;
}
}
78 changes: 78 additions & 0 deletions src/main/java/varaha/text/SentenceTokenize.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package varaha.text;

import java.io.IOException;
import java.io.StringReader;
import java.util.*;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.BagFactory;

import edu.stanford.nlp.process.DocumentPreprocessor;

/**
 * SentenceTokenize breaks raw text into sentences using the Stanford
 * {@link DocumentPreprocessor}. Output is a pig bag with one tuple per
 * sentence; each tuple carries a single field, a bag of one-field tuples
 * holding that sentence's tokens in order.
 *
 * <dl>
 * <dt><b>Example:</b></dt>
 * <dd><code>
 * register varaha.jar;<br/>
 * documents = LOAD 'documents' AS (doc_id:chararray, text:chararray);<br/>
 * sentences = FOREACH documents GENERATE doc_id AS doc_id, FLATTEN(varaha.text.SentenceTokenize(text)) AS tokenized_sentences;
 * </code></dd>
 * </dl>
 *
 * @author Russell Jurney
 *
 */
public class SentenceTokenize extends EvalFunc<DataBag> {

    private static TupleFactory tupleFactory = TupleFactory.getInstance();
    private static BagFactory bagFactory = BagFactory.getInstance();

    /**
     * Splits the text in field 0 into sentences and tokens.
     *
     * @param input tuple whose first field is the text to split
     * @return a bag of per-sentence token bags, or {@code null} when the
     *         tuple is null, empty, or its first field is null
     * @throws IOException if the input tuple cannot be read
     */
    public DataBag exec(Tuple input) throws IOException {
        boolean nothingToSplit = input == null || input.size() < 1 || input.isNull(0);
        if (nothingToSplit) {
            return null;
        }

        DataBag sentences = bagFactory.newDefaultBag();
        String rawText = input.get(0).toString();

        for (List<?> sentenceWords : new DocumentPreprocessor(new StringReader(rawText))) {
            DataBag words = wordsToBag(sentenceWords);
            sentences.add(tupleFactory.newTuple(words));
        }
        return sentences;
    }

    /** Wraps every element of one sentence in its own single-field tuple. */
    private static DataBag wordsToBag(List<?> sentenceWords) {
        DataBag words = bagFactory.newDefaultBag();
        for (Object element : sentenceWords) {
            String text = element.toString();
            words.add(tupleFactory.newTuple(text));
        }
        return words;
    }
}
130 changes: 130 additions & 0 deletions src/main/java/varaha/text/StanfordPOSTag.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package varaha.text;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.process.DocumentPreprocessor;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.BagFactory;

import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.ling.Word;

/**
* StanfordPOSTagger uses the Stanford Maximum Entropy Tagger class to Part-Of-Speech tag a
* raw text input. Output is a pig bag containing two-field tuples, of the format (word, tag).
*
* <dt><b>Example:</b></dt>
* <dd><code>
* register varaha.jar;<br/>
* documents = LOAD 'documents' AS (doc_id:chararray, text:chararray);<br/>
* tokenized = FOREACH documents GENERATE doc_id AS doc_id, StanfordPOSTagger(text)
* AS (b:bag{token:tuple(word:chararray, tag:chararray)});
* </code></dd>
* </dl>
*
* @see
* @author Russell Jurney
*
*/
public class StanfordPOSTag extends EvalFunc<DataBag> {

private static TupleFactory tupleFactory = TupleFactory.getInstance();
private static BagFactory bagFactory = BagFactory.getInstance();
private static boolean isFirst = true;
private static MaxentTagger tagger;

// Must also add implementation for bag sof tuples of sentences
public DataBag exec(Tuple input) throws IOException {
if (input == null || input.size() < 1 || input.isNull(0))
return null;

if(isFirst)
{
try {
tagger = new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger");
}
catch(Exception e) {
System.err.println("Exception loading language model: " + e.getMessage());
}
isFirst = false;
}

// Output bag
DataBag bagOfTokens = bagFactory.newDefaultBag();

Object inThing = input.get(0);
if(inThing instanceof String) {
StringReader textInput = new StringReader((String)inThing);
Tuple termText = null;
List<TaggedWord> taggedSentence = null;
DocumentPreprocessor dp = new DocumentPreprocessor(textInput);
for (List sentence : dp) {
taggedSentence = tagger.apply(sentence);

// Now split based on '_' and build/return a bag of 2-field tuples
termText = tupleFactory.newTuple();
for (TaggedWord word : taggedSentence ) {
String token = word.word();
String tag = word.tag();
termText = tupleFactory.newTuple(Arrays.asList(token, tag));
bagOfTokens.add(termText);
}
}
bagOfTokens.add(termText);
}
else if(inThing instanceof DataBag) {
Iterator<Tuple> itr = ((DataBag)inThing).iterator();
List<Word> sentence = new ArrayList<Word>();
while(itr.hasNext()) {
Tuple t = itr.next();
if(t.get(0) != null) {
Word word = new Word(t.get(0).toString());
sentence.add(word);
}
}
List<TaggedWord> taggedSentence = tagger.apply(sentence);
for( TaggedWord word : taggedSentence) {
String token = word.word();
String tag = word.tag();
Tuple termText = tupleFactory.newTuple(Arrays.asList(token, tag));
bagOfTokens.add(termText);
}
}
else
{
throw new IOException();
}

return bagOfTokens;
}
}
12 changes: 7 additions & 5 deletions src/main/java/varaha/text/StanfordTokenize.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
* </dl>
*
* @see
* @author Jacob Perkins
* @author Russell Jurney
*
*/
public class StanfordTokenize extends EvalFunc<DataBag> {
Expand All @@ -74,10 +74,12 @@ public DataBag exec(Tuple input) throws IOException {

for (CoreLabel label; ptbt.hasNext(); ) {
label = (CoreLabel)ptbt.next();
Tuple termText = tupleFactory.newTuple(label.toString());
bagOfTokens.add(termText);
}

if(label.value().length() > 2)
{
Tuple termText = tupleFactory.newTuple(label.word());
bagOfTokens.add(termText);
}
}
return bagOfTokens;
}
}