Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Info #7

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 80 additions & 25 deletions lib/proiel/cli/converters/conll-u.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ def process(tb, options = [])
source.divs.each do |div|
div.sentences.each do |sentence|
sentence_count += 1
n = Sentence.new sentence
begin
n = Sentence.new sentence
rescue => e
STDERR.puts "Cannot initialize #{sentence.id} (#{sentence.citation}): #{e}"
end
begin
# Do the conversion first to avoid spurious headers if the conversion fails
a = n.convert.to_conll
Expand Down Expand Up @@ -51,8 +55,9 @@ def initialize(sentence)
# keep track of how many new tokens have been created
offset = 0

sentence.tokens.reject { |t| t.empty_token_sort == 'P' }.each do |tk|
sentence.tokens.each do |tk|

# deal with tokens with space
if tk.form =~ /[[:space:]]/
subtoks = tk.form.split(/[[:space:]]/)

Expand Down Expand Up @@ -84,28 +89,62 @@ def initialize(sentence)
end
end


tks.map(&:id).each_with_index.each do |id, i|
id_to_number[id] = i + 1
# here we need to do some tricks with empty tokens
# empty C and V tokens always get high token numbers, so nothing to worry about
# but empty P tokens are placed around their verbs
# initialize token numbering at 1
i = 1
tks.each do |tk|
if tk.empty_token_sort != "P"
id_to_number[tk.id] = i.to_s
i += 1
else
p_number = tk.head.dependents.select(&:pro?).find_index { |p| p == tk } + 1
# if we have a subject, it will come before the verb and
# the counter is not yet incremented. Objects and obliques
# will be placed after their verb, so the counter will
# have been incremented and we must subtract one.

# For obliques, there is the added complication that there
# could be an overt object, potentially dominating
# other tokens, which could intervene between the verb and
# the pro-drop oblique, causing the counter to be
# incremented.


if tk.relation == "sub"
h_number = i
elsif tk.relation == "obj"
h_number = i - 1
else
intervening_tokens = tks[(tks.find_index { |h| h.id == tk.head_id } + 1)..(tks.index(tk) - 1)].reject(&:pro?).length
h_number = i - 1 - intervening_tokens
end
id_to_number[tk.id] = "#{h_number.to_s}.#{(p_number).to_s}"
end
end

@tokens = tks.map do |t|

Token.new(id_to_number[t.id],
id_to_number[t.head_id],
#insert dots in any whitespace inside words and lemmata
t.form.to_s.gsub(/[[:space:]]/, '.'),
t.lemma.to_s.gsub(/[[:space:]]/, '.'),
(t.form.to_s == '' ? '_' : t.form.to_s.gsub(/[[:space:]]/, '.')),
(t.lemma.to_s == '' ? '_' : t.lemma.to_s.gsub(/[[:space:]]/, '.')),
t.part_of_speech,
t.language,
t.morphology,
t.relation,
t.empty_token_sort,
t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] },
t.citation_part,
t.id,
(t.information_status == "info_unannotatable" ? nil : t.information_status),
t.antecedent_id,
self
)
end
end.sort_by { |tk| [tk.id.split(".")[0].to_i, tk.id] }
# sort so that empty tokens come after their mother node
end

def convert
Expand All @@ -122,20 +161,24 @@ def distribute_conjunctions!
conjuncts = h.dependents.select { |d| d.relation == 'conj' }
conjunctions = h.dependents.select { |d| d.relation == 'cc' }
conjunctions.each do |c|
if c.id > h.id
new_head = conjuncts.select { |cj| cj.id > c.id }.first
if c.id.split(".")[0].to_i > h.id.split(".")[0].to_i
new_head = conjuncts.select { |cj| cj.id.split(".")[0].to_i > c.id.split(".")[0].to_i }.first
c.head_id = new_head.id if new_head
end
end
end
end

# TODO: to deal with empty tokens, we must use a string as index, not a number. This breaks some of these tests. Either
# 1) convert back to number here before we run the test, or
# 2) stay with id as a number, give empty p-tokens a sub_id attribute, and compose this at the end to give the id to use in the conll file.
# Option 2) is probably best, but it may interfere with finding the heads of tokens, because @id will not be unique.
# Enforces the expected head/dependent direction for selected relations.
#
# Token ids are strings that may carry a sub-index after a dot (e.g. "3.1"),
# so positions are compared on the integer part before the dot. Comparing
# the raw strings would order "10" before "9".
#
# * 'fixed', 'flat:foreign' and 'flat:name' dependents that stand before
#   their head are promoted via promote!(nil, relation).
# * a 'conj' dependent that stands before its head raises a RuntimeError.
def check_directionality!
  position = ->(token) { token.id.split(".")[0].to_i }
  precedes_head = ->(token) { position.call(token) < position.call(token.head) }

  @tokens.select { |t| ['fixed', 'flat:foreign', 'flat:name'].include? t.relation }.each do |f|
    f.promote!(nil, f.relation) if precedes_head.call(f)
  end
  @tokens.select { |t| t.relation == 'conj' }.each do |f|
    next unless precedes_head.call(f)

    raise "conj must go left-to-right but dependent #{f.id} is to the left of head #{f.head.id}"
  end
end

Expand Down Expand Up @@ -209,17 +252,16 @@ def map_part_of_speech!
end

def restructure_graph!
@tokens.delete_if { |n| n.empty_token_sort == 'P' }
@tokens.select(&:preposition?).each(&:process_preposition!)
@tokens.select { |t| t.comparison_word? and t.dependents and t.dependents.select { |d| ['sub','obj','obl','comp','adv'].include?(d.relation) }.any? }.each(&:process_comparison!)
roots.each(&:change_coordinations!)
@tokens.select(&:copula?).each(&:process_copula!)
demote_subjunctions!
prune_empty_rootnodes!
# do ellipses from left to right for proper remnant treatment
@tokens.select(&:ellipsis?).sort_by { |e| e.left_corner.id }.each(&:process_ellipsis!)
@tokens.select(&:ellipsis?).sort_by { |e| e.left_corner.id.split(".")[0].to_i }.each(&:process_ellipsis!)
#NB! apos gets overridden by process_comparison so some dislocations are lost
@tokens.select { |t| t.relation == 'apos' and t.id < t.head_id }.each(&:process_dislocation!)
@tokens.select { |t| t.relation == 'apos' and t.id.split(".")[0].to_i < t.head_id.split(".")[0].to_i }.each(&:process_dislocation!)
# DIRTY: remove the rest of the empty nodes by attaching them
# to their grandmother with remnant. This is the best way to
# do it given the current state of the UDEP scheme, but
Expand All @@ -242,7 +284,7 @@ class Token
attr_reader :form
attr_reader :citation_part

def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, sentence)
def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, proiel_id, info_status, antecedent_id, sentence)
@id = id
@head_id = head_id
@form = form
Expand All @@ -257,6 +299,9 @@ def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, r
@sentence = sentence
@features = (morphology ? map_morphology(morphology) : '' )
@citation_part = 'ref=' + (citation_part ? citation_part : '').gsub(/\s/, '_')
@proiel_id = 'proiel-id=' + proiel_id.to_s
@info_status = info_status
@antecedent_id = antecedent_id
@upos = nil
end

Expand Down Expand Up @@ -485,21 +530,24 @@ def format_features(features)
end

# Builds the MISC field: the citation reference and the PROIEL id joined by
# "|", followed by the LId, information-status and antecedent-proiel-id
# entries, each appended only when the corresponding value is set.
def miscellaneous
  base = @citation_part + "|" + @proiel_id
  optional = []
  optional << "|LId=#{@variant}" if @variant
  optional << "|information-status=#{@info_status}" if @info_status
  optional << "|antecedent-proiel-id=#{@antecedent_id.to_s}" if @antecedent_id
  base + optional.join
end

# Serialises the token as one tab-separated CoNLL-U line with ten columns:
# ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC.
#
# Empty tokens of sort "P" get "_" in HEAD and DEPREL; their attachment is
# recorded only in the DEPS column. A missing lemma or part of speech does
# not raise: the lemma goes through to_s and XPOS falls back to "_".
def to_conll
  pro_token = @empty_token_sort == "P"
  # override non-root relations on root until we've found out how to handle unembedded reports etc
  deprel = @head_id == 0 ? 'root' : @relation

  [@id,
   @form,
   @baselemma.to_s.gsub(/не\./, ''),
   @upos,
   (@part_of_speech || '_'),
   format_features(@features),
   (pro_token ? "_" : @head_id),
   (pro_token ? "_" : deprel),
   "#{@head_id}:#{@relation}", # slashes will eventually go here
   miscellaneous].join("\t")
end

Expand Down Expand Up @@ -559,10 +607,16 @@ def find_relation possible_relations
end
end

# TODO: we want empty P tokens to have part of speech PRON and then head:nsubj/obj in the DEPS column and MISC as the others

def map_part_of_speech!
dependents.each(&:map_part_of_speech!)
possible_postags = POS_MAP[@part_of_speech]
find_postag possible_postags.dup
if @empty_token_sort == "P"
@upos = "PRON"
else
find_postag possible_postags.dup
end
# ugly, but the ugliness comes from UDEP
@upos = 'PRON' if @upos == 'DET' and @relation != 'det'
@upos = REL_TO_POS[@relation] if @upos == 'X'
Expand Down Expand Up @@ -691,9 +745,10 @@ def process_preposition!
mods.each { |m| m.head_id = obliques.first.id }
end

# TODO: this removes all empty nodes, irrespective of type. Does not work when we want to keep P type nodes
def remove_empties!
dependents.each(&:remove_empties!)
if is_empty?
if is_empty? and empty_token_sort != "P"
dependents.each { |d| d.head_id = head_id; d.relation = 'remnant' }
@sentence.remove_token! self
end
Expand All @@ -709,7 +764,7 @@ def process_coordination!
raise 'Only coordinations can be processed this way!' unless conjunction?
return if dependents.reject { |d| d.relation == 'aux' }.empty?
distribute_shared_modifiers!
dependents.reject { |d| d.relation == 'aux' }.sort_by { |d| d.left_corner.id }.first.promote!('conj', 'cc')
dependents.reject { |d| d.relation == 'aux' }.sort_by { |d| d.left_corner.id.split(".")[0].to_i }.first.promote!('conj', 'cc')
end

def distribute_shared_modifiers!
Expand Down