Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
1.4.0
--
част. снятие омонимии на морф. анализе; испр. нач.формы глагола; подд. е вместо ё; расширение словаря

1.2.16
--

Expand Down
13 changes: 12 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,13 @@
<parent>
<groupId>ru.textanalysis.tawt</groupId>
<artifactId>tawt-parent</artifactId>
<version>0.0.5</version>
<version>0.1.0</version>
</parent>

<properties>
<jsoup.version>1.14.2</jsoup.version>
</properties>

<dependencies>
<!-- jalexpr-->
<dependency>
Expand Down Expand Up @@ -49,6 +53,13 @@
<artifactId>lombok</artifactId>
<scope>provided</scope>
</dependency>

<!-- jsoup-->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>${jsoup.version}</version>
</dependency>
</dependencies>

<build>
Expand Down
4 changes: 4 additions & 0 deletions src/main/java/ru/textanalysis/tawt/ms/constant/Const.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ public interface Const {
int BUFFER_SIZE_FOR_INSERT = 10_000;
String START_INSERT = "INSERT INTO 'Form' ('id','StringForm') VALUES ";
String CONTINUED_INSERT = "(%d, '%s')";

String TAB_SEPARATOR = "\t";
String COMMA_SEPARATOR = ",";
String SEMICOLON_SEPARATOR = ";";

String TAB_AND_COMMA_REGEX = "[,;]";
}
12 changes: 12 additions & 0 deletions src/main/java/ru/textanalysis/tawt/ms/constant/TypeOfSpeechs.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package ru.textanalysis.tawt.ms.constant;

/**
 * Part-of-speech tag strings used by the dictionary conversion code.
 *
 * <p>The values are compared against the characteristic strings read from the source
 * dictionary. They are presumably OpenCorpora grammeme names — confirm against the
 * dictionary's grammeme list.
 */
public interface TypeOfSpeechs {

    /** Tag value {@code "ADJF"}. */
    String ADJF = "ADJF";

    /** Tag value {@code "ADJS"}. */
    String ADJS = "ADJS";

    /** Tag value {@code "INFN"}. */
    String INFN = "INFN";

    /** Tag value {@code "NUMR"}. */
    String NUMR = "NUMR";

    /** Tag value {@code "PRTF"}. */
    String PRTF = "PRTF";

    /** Tag value {@code "PRTS"}. */
    String PRTS = "PRTS";

    /** Tag value {@code "VERB"}. */
    String VERB = "VERB";
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,22 +38,24 @@
package ru.textanalysis.tawt.ms.conversion.dictionary;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import ru.textanalysis.tawt.ms.loader.DatabaseFactory;
import ru.textanalysis.tawt.ms.loader.DatabaseLemmas;
import ru.textanalysis.tawt.ms.loader.DatabaseStrings;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.File;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import java.util.*;

import static ru.textanalysis.tawt.ms.constant.Const.COMMA_SEPARATOR;
import static ru.textanalysis.tawt.ms.constant.Const.TAB_SEPARATOR;
import static ru.textanalysis.tawt.ms.constant.TypeOfSpeechs.INFN;
import static ru.textanalysis.tawt.ms.constant.TypeOfSpeechs.VERB;

@Slf4j
public class ConversionDictionary {
Expand All @@ -63,51 +65,178 @@ public class ConversionDictionary {

/**
 * Command-line entry point: creates a {@code ConversionDictionary} and runs the conversion
 * on a hard-coded dictionary file name using UTF-8.
 *
 * <p>NOTE(review): two invocations are present below ("dict.opcorpora.txt" and
 * "dict.opcorpora.xml"). This looks like the removed and the added line of a diff shown
 * together — confirm that only one of them is meant to remain.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(String[] args) {
ConversionDictionary conversionDictionary = new ConversionDictionary();
conversionDictionary.conversionDictionary("dict.opcorpora.txt", StandardCharsets.UTF_8);
conversionDictionary.conversionDictionary("dict.opcorpora.xml", StandardCharsets.UTF_8);
}

// todo: yo (the letter "ё")
/**
 * Converts the source dictionary into lemmas, recreates both databases from them and then
 * runs compression on both databases.
 *
 * <p>NOTE(review): two signatures and two {@code lemmas} declarations are present below
 * (with and without {@code additionalDictionaryPaths}). This looks like the removed and
 * the added lines of a diff shown together — confirm which version is current.
 *
 * @param sourceDictionaryPath      path of the main dictionary file
 * @param encoding                  charset passed through to the lemma conversion
 * @param additionalDictionaryPaths further dictionary files passed through to the lemma conversion
 */
public void conversionDictionary(String sourceDictionaryPath, Charset encoding) {
List<List<FormForConversion>> lemmas = convertLemmasFromInitDictionary(sourceDictionaryPath, encoding);
public void conversionDictionary(String sourceDictionaryPath, Charset encoding, String... additionalDictionaryPaths) {
List<List<FormForConversion>> lemmas = convertLemmasFromInitDictionary(sourceDictionaryPath, encoding, additionalDictionaryPaths);
// Lemmas are written first, then strings; compression runs in the opposite order.
databaseLemmas.recreate(lemmas);
databaseStrings.recreate(lemmas);

databaseStrings.compression();
databaseLemmas.compression();
}

/**
 * Reads the dictionary file(s) and builds the list of lemmas, each lemma being a list of forms.
 *
 * <p>NOTE(review): this span contains lines of two different implementations (a line-based
 * {@code BufferedReader} loop and an XML DOM walk), including two signatures and two
 * {@code catch} clauses. It looks like the removed and added lines of a diff shown
 * together — confirm which lines are current before relying on the comments below.
 *
 * <p>What the DOM-based lines do: every path (main plus additional) is parsed as XML.
 * Under {@code lemmata}, each {@code lemma} element is turned into a list of forms keyed
 * by its {@code id} attribute; lemmas containing a form whose text includes INFN or VERB
 * are kept as raw strings in {@code verbs} instead. Under {@code links}, links of type "3"
 * build one lemma out of two {@code verbs} entries, and links of any type other than "3"
 * and "11" attach a link to the forms of the "to" lemma. Finally the map values become the
 * result list and the list returned by {@code addSupportYo} is appended to it.
 *
 * <p>NOTE(review): in the DOM-based lines {@code encoding} is not referenced; it is only
 * used by the {@code InputStreamReader} line.
 *
 * @param sourceDictionaryPath      path of the main dictionary file
 * @param encoding                  charset of the dictionary (see note above)
 * @param additionalDictionaryPaths further dictionary files processed after the main one
 * @return the collected lemmas; when an exception is caught it is logged and whatever
 *         {@code lemmas} holds at that point is returned
 */
private List<List<FormForConversion>> convertLemmasFromInitDictionary(String sourceDictionaryPath, Charset encoding) {
private List<List<FormForConversion>> convertLemmasFromInitDictionary(String sourceDictionaryPath, Charset encoding, String... additionalDictionaryPaths) {
// The main dictionary is processed first, followed by the additional ones in the given order.
List<String> dictionaryPaths = new ArrayList<>();
dictionaryPaths.add(sourceDictionaryPath);
dictionaryPaths.addAll(Arrays.asList(additionalDictionaryPaths));
List<List<FormForConversion>> lemmas = new ArrayList<>();
try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(sourceDictionaryPath), encoding))) {
while (bufferedReader.ready()) {
String initForm = bufferedReader.readLine();
if (StringUtils.isNotBlank(initForm) && !Pattern.matches("\\d+", initForm)) { //todo
List<FormForConversion> lemma = new LinkedList<>();
FormForConversion initialForm = createForm(initForm, true);
lemma.add(initialForm);
while (bufferedReader.ready()) {
String derivativeForm = bufferedReader.readLine();
if (StringUtils.isBlank(derivativeForm)) {
break;
// lemmasMap: lemma id -> forms of a finished lemma; verbs: lemma id -> raw form strings of verb/infinitive lemmas.
HashMap<Integer, List<FormForConversion>> lemmasMap = new HashMap<>();
HashMap<Integer, List<String>> verbs = new HashMap<>();

try {
// NOTE(review): the factory is used with its default settings; if dictionaries can come from
// untrusted sources, consider disabling DTDs / external entities — confirm the threat model.
DocumentBuilder documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
for (String dictionaryPath : dictionaryPaths) {
Document document = documentBuilder.parse(new File(dictionaryPath).toURI().toString());

Node dictionary = document.getDocumentElement();
NodeList dictionaryProps = dictionary.getChildNodes();
for (int i = 0; i < dictionaryProps.getLength(); i++) {
Node node = dictionaryProps.item(i);
// These three sections are not read below; their children are removed (presumably to free memory).
if (node.getNodeType() != Node.TEXT_NODE && (node.getNodeName().equals("grammemes") || node.getNodeName().equals("restrictions") || node.getNodeName().equals("link_types"))) {
while (node.hasChildNodes()) {
node.removeChild(node.getFirstChild());
}
}
}
for (int i = 0; i < dictionaryProps.getLength(); i++) {
Node lemmata = dictionaryProps.item(i);
if (lemmata.getNodeType() != Node.TEXT_NODE && lemmata.getNodeName().equals("lemmata")) {
NodeList lemmataProps = lemmata.getChildNodes();
for (int j = 0; j < lemmataProps.getLength(); j++) {
Node lemma = lemmataProps.item(j);
if (lemma.getNodeType() != Node.TEXT_NODE && lemma.getNodeName().equals("lemma")) {
String commonCharacteristics = "";
int formNumber = 0;
boolean isVerb = false;
List<String> verbInfn = new ArrayList<>();
List<FormForConversion> wordLemma = new LinkedList<>();
NodeList lemmaProps = lemma.getChildNodes();
for (int k = 0; k < lemmaProps.getLength(); k++) {
Node value = lemmaProps.item(k);
// "l" element: its "g" children's "v" attributes are joined with commas into the characteristics shared by all forms.
if (value.getNodeType() != Node.TEXT_NODE && value.getNodeName().equals("l")) {
NodeList valueProps = value.getChildNodes();
for (int m = 0; m < valueProps.getLength(); m++) {
Node characteristic = valueProps.item(m);
if (characteristic.getNodeType() != Node.TEXT_NODE && characteristic.getNodeName().equals("g")) {
commonCharacteristics += characteristic.getAttributes().getNamedItem("v").getNodeValue();
commonCharacteristics += COMMA_SEPARATOR;
}
}
// Drop the trailing separator appended by the loop above.
if (commonCharacteristics.length() > 0) {
commonCharacteristics = commonCharacteristics.substring(0, commonCharacteristics.length() - 1);
}
// "f" element: builds "<t attribute><TAB><common characteristics>[,<own g values>...]" for one form.
} else if (value.getNodeType() != Node.TEXT_NODE && value.getNodeName().equals("f")) {
StringBuilder formCharacteristics = new StringBuilder(value.getAttributes().getNamedItem("t").getNodeValue());
formCharacteristics.append(TAB_SEPARATOR);
formCharacteristics.append(commonCharacteristics);
NodeList valueProps = value.getChildNodes();
for (int m = 0; m < valueProps.getLength(); m++) {
Node characteristic = valueProps.item(m);
if (characteristic.getNodeType() != Node.TEXT_NODE && characteristic.getNodeName().equals("g")) {
formCharacteristics.append(COMMA_SEPARATOR);
formCharacteristics.append(characteristic.getAttributes().getNamedItem("v").getNodeValue());
}
}
// The check is a substring match on the whole built string (form text included), not only on the tags.
if (formCharacteristics.toString().contains(INFN) || formCharacteristics.toString().contains(VERB)) {
isVerb = true;
verbInfn.add(formCharacteristics.toString());
} else {
// The first non-verb form of a lemma is created as the initial form.
if (formNumber == 0) {
FormForConversion initialForm = createForm(formCharacteristics.toString(), true);
wordLemma.add(initialForm);
} else {
wordLemma.add(createForm(formCharacteristics.toString(), false));
}
formNumber++;
}
}
}
// A lemma with at least one INFN/VERB form is stored only in verbs; its wordLemma list is not stored.
if (isVerb) {
verbs.put(Integer.valueOf(lemma.getAttributes().getNamedItem("id").getNodeValue()), verbInfn);
} else {
lemmasMap.put(Integer.valueOf(lemma.getAttributes().getNamedItem("id").getNodeValue()), wordLemma);
}
// The processed lemma element is emptied (presumably to free memory).
while (lemma.hasChildNodes()) {
lemma.removeChild(lemma.getFirstChild());
}
}
}
} else if (lemmata.getNodeType() != Node.TEXT_NODE && lemmata.getNodeName().equals("links")) {
NodeList lemmataProps = lemmata.getChildNodes();
for (int j = 0; j < lemmataProps.getLength(); j++) {
Node lemma = lemmataProps.item(j);
if (lemma.getNodeType() != Node.TEXT_NODE && lemma.getNodeName().equals("link")) {
// Link type "3": the first string of the "from" entry (INFN replaced by VERB) becomes the initial form,
// followed by every form of the "to" entry; the result is stored under the "from" id.
// NOTE(review): verbs.get(...) returns null for an unknown id, which would throw here and end up in the catch below.
if (Objects.equals(lemma.getAttributes().getNamedItem("type").getNodeValue(), "3")) {
List<FormForConversion> wordLemma = new LinkedList<>();
List<String> infn = verbs.get(Integer.parseInt(lemma.getAttributes().getNamedItem("from").getNodeValue()));
FormForConversion initialForm = createForm(infn.get(0).replaceAll(INFN, VERB), true);
wordLemma.add(initialForm);
List<String> verb = verbs.get(Integer.parseInt(lemma.getAttributes().getNamedItem("to").getNodeValue()));
verb.forEach(form -> {
FormForConversion derivativeForm = createForm(form, false);
wordLemma.add(derivativeForm);
});
lemmasMap.put(Integer.valueOf(lemma.getAttributes().getNamedItem("from").getNodeValue()), wordLemma);
// Any type other than "3" and "11": each form of the "to" lemma is linked to the first form of the "from" lemma,
// provided both ids are present in lemmasMap.
} else if (!Objects.equals(lemma.getAttributes().getNamedItem("type").getNodeValue(), "11")) {
if (lemmasMap.containsKey(Integer.parseInt(lemma.getAttributes().getNamedItem("to").getNodeValue()))) {
lemmasMap.get(Integer.parseInt(lemma.getAttributes().getNamedItem("to").getNodeValue())).forEach(formLemma -> {
if (lemmasMap.containsKey(Integer.parseInt(lemma.getAttributes().getNamedItem("from").getNodeValue()))) {
formLemma.setLink(lemmasMap.get(Integer.parseInt(lemma.getAttributes().getNamedItem("from").getNodeValue())).get(0).hashCode(),
lemmasMap.get(Integer.parseInt(lemma.getAttributes().getNamedItem("from").getNodeValue())).get(0).getKey());
}
});
}
}
while (lemma.hasChildNodes()) {
lemma.removeChild(lemma.getFirstChild());
}
}
}
lemma.add(createForm(derivativeForm, false));
}
lemmas.add(lemma);
}
}
} catch (IOException ex) {
// The map values become the result list; the extra lemma built by addSupportYo is appended last.
lemmas = new ArrayList<>(lemmasMap.values());
lemmasMap.clear();
verbs.clear();
lemmas.add(addSupportYo(lemmas));
// NOTE(review): the log message names only sourceDictionaryPath, even if an additional dictionary failed.
} catch (Exception ex) {
log.error("Ошибка при чтении файла. Файл: {}", sourceDictionaryPath, ex);
}
return lemmas;
}

/**
 * Builds one extra lemma holding a "ё"-free variant of every form whose string contains "ё".
 *
 * <p>For each such form, the text before the first tab is taken, every "ё" is replaced by
 * "е", and a new form is created from it and linked back to the original form through the
 * original's {@code hashCode()} and key. The first form added to the result is created as
 * the initial form; all later ones are created as non-initial.
 *
 * @param lemmas the lemmas to scan; not modified by this method
 * @return the list of generated forms, empty when no form contains "ё"
 */
private List<FormForConversion> addSupportYo(List<List<FormForConversion>> lemmas) {
    List<FormForConversion> yoLemma = new LinkedList<>();
    for (List<FormForConversion> lemma : lemmas) {
        for (FormForConversion original : lemma) {
            String name = original.getStringName();
            if (!name.contains("ё")) {
                continue;
            }
            String withoutYo = name.split(TAB_SEPARATOR)[0].replace("ё", "е");
            // Only the very first generated form is the initial one.
            FormForConversion yoForm = createForm(withoutYo, yoLemma.isEmpty());
            yoForm.setLink(original.hashCode(), original.getKey());
            yoLemma.add(yoForm);
        }
    }
    return yoLemma;
}

/**
 * Creates a form from one dictionary line of the shape "form&lt;TAB&gt;characteristics".
 *
 * <p>The line is lower-cased with {@code Locale.ROOT} and split on tabs. The first field
 * is the form string; the second field, if present, is split on commas and spaces and
 * passed to {@code setCharacteristics}.
 *
 * <p>NOTE(review): {@code parameters} is declared twice and the {@code else} branch holds
 * both a {@code println} and an empty-array call. This looks like the removed and added
 * lines of a diff shown together — confirm which lines are current.
 *
 * @param line          dictionary line describing one form
 * @param isInitialForm passed through to the {@code FormForConversion} constructor
 * @return the created form
 */
private FormForConversion createForm(String line, boolean isInitialForm) {
String[] parameters = line.toLowerCase(Locale.ROOT).split("\t");
String[] parameters = line.toLowerCase(Locale.ROOT).split(TAB_SEPARATOR);
FormForConversion form = new FormForConversion(parameters[0], isInitialForm);
if (parameters.length > 1) {
form.setCharacteristics(parameters[1].split("[, ]"));
} else {
// No characteristics field after the tab split.
System.out.println("error"); //todo
form.setCharacteristics(new String[0]);
}
return form;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,27 +23,43 @@ public class FormForConversion {
private final int key;
private byte partOfSpeech;
private byte[] morfCharacteristics;
private byte[] Link;
private boolean isFirstKey;

/**
 * Creates a form for the given string.
 *
 * <p>The string is stored lower-cased with {@code Locale.ROOT}. The key is then obtained
 * from {@code createKey}, and the link starts out as the byte encoding of {@code 0}.
 *
 * @param stringName    text of the form
 * @param isInitialForm passed to {@code createKey}
 */
protected FormForConversion(String stringName, boolean isInitialForm) {
    this.stringName = stringName.toLowerCase(Locale.ROOT);
    // createKey is called only after stringName has been assigned; keep this order.
    this.key = createKey(isInitialForm);
    this.Link = getBytes(0L);
}

/**
 * Returns the string of this form (the constructor stores it lower-cased).
 *
 * @return the form's string
 */
public String getStringName() {
    return this.stringName;
}

/**
 * Sets the part of speech and the morphological characteristics from the given tags.
 *
 * <p>For a non-empty array the tags are copied into a mutable list, which is handed first
 * to {@code conversionPartOfSpeech} and then to {@code conversionMorfCharacteristics}.
 * For an empty array the part of speech is set to {@code 0} and the characteristics to
 * the byte encoding of {@code 0}.
 *
 * <p>NOTE(review): the three statements before the {@code if} repeat the body of its first
 * branch. This looks like the removed and added lines of a diff shown together — confirm
 * which lines are current.
 *
 * @param characteristics tags of the form; may be empty
 */
protected void setCharacteristics(String[] characteristics) {
List<String> parameters = new ArrayList<>(Arrays.asList(characteristics));
setPartOfSpeech(conversionPartOfSpeech(parameters));
setMorfCharacteristics(getBytes(conversionMorfCharacteristics(parameters)));
if (characteristics.length > 0) {
List<String> parameters = new ArrayList<>(Arrays.asList(characteristics));
setPartOfSpeech(conversionPartOfSpeech(parameters));
setMorfCharacteristics(getBytes(conversionMorfCharacteristics(parameters)));
} else {
// No tags at all: fall back to zero values.
setPartOfSpeech((byte) 0);
long chars = 0;
setMorfCharacteristics(getBytes(chars));
}
}

/**
 * Stores the given part-of-speech byte on this form.
 *
 * @param partOfSpeech encoded part of speech
 */
private void setPartOfSpeech(byte partOfSpeech) {
this.partOfSpeech = partOfSpeech;
}

/**
 * Packs a hash and a key into one {@code long} and stores its byte encoding as the link.
 *
 * <p>The hash is shifted into the upper 32 bits and the key is then added to the result.
 *
 * <p>NOTE(review): the key is added as a signed value, so a negative key lowers the upper
 * 32 bits by one instead of only filling the lower 32 bits. Confirm whether keys can be
 * negative and how the reader of this format decodes the link before changing it.
 *
 * @param hash value placed in the upper half of the link
 * @param key  value added into the lower half of the link
 */
protected void setLink(int hash, int key) {
    long packed = ((long) hash << 32) + key;
    this.Link = getBytes(packed);
}

/**
 * Returns the part-of-speech byte currently stored on this form.
 *
 * @return encoded part of speech
 */
private byte getPartOfSpeech() {
    return this.partOfSpeech;
}
Expand Down Expand Up @@ -114,6 +130,10 @@ private byte[] getMorfCharacteristics() {
return morfCharacteristics;
}

/**
 * Returns the link bytes of this form; the internal array itself is returned, not a copy.
 *
 * @return byte encoding of the link
 */
private byte[] getLink() {
    return this.Link;
}

/**
 * Returns the key assigned to this form in the constructor.
 *
 * @return the form's key
 */
public int getKey() {
    return this.key;
}
Expand Down Expand Up @@ -157,6 +177,7 @@ public byte[] getByteFileFormat() {
bytesFormat = plusByte(bytesFormat, getPartOfSpeech());
}
bytesFormat = plusByte(bytesFormat, getMorfCharacteristics());
bytesFormat = plusByte(bytesFormat, getLink());
return bytesFormat;
}

Expand Down
Loading