From 28610cd9e161fb634fc0e333daeed1d817defdbe Mon Sep 17 00:00:00 2001 From: Dag Haug Date: Sat, 3 Feb 2024 12:42:37 +0100 Subject: [PATCH 1/4] Add info structure annotation --- lib/proiel/cli/converters/conll-u.rb | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/lib/proiel/cli/converters/conll-u.rb b/lib/proiel/cli/converters/conll-u.rb index 2f30a9d..92fd6d4 100644 --- a/lib/proiel/cli/converters/conll-u.rb +++ b/lib/proiel/cli/converters/conll-u.rb @@ -103,6 +103,9 @@ def initialize(sentence) t.empty_token_sort, t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] }, t.citation_part, + t.id, + t.information_status, + t.antecedent_id, self ) end @@ -242,7 +245,7 @@ class Token attr_reader :form attr_reader :citation_part - def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, sentence) + def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, relation, empty_token_sort, slashes, citation_part, proiel_id, info_status, antecedent_id, sentence) @id = id @head_id = head_id @form = form @@ -257,6 +260,9 @@ def initialize(id, head_id, form, lemma, part_of_speech, language, morphology, r @sentence = sentence @features = (morphology ? map_morphology(morphology) : '' ) @citation_part = 'ref=' + (citation_part ? 
citation_part : '').gsub(/\s/, '_') + @proiel_id = 'proiel-id=' + proiel_id.to_s + @info_status = info_status + @antecedent_id = antecedent_id @upos = nil end @@ -485,8 +491,10 @@ def format_features(features) end def miscellaneous - m = @citation_part + m = @citation_part + "|" + @proiel_id m += "|LId=#{@variant}" if @variant + m += "|information-status=#{@info_status}" if @info_status + m += "|antecedent-proiel-id=#{@antecedent_id.to_s}" if @antecedent_id m end From 29320664abd7bea7c26f869824a55fb0378a1ba5 Mon Sep 17 00:00:00 2001 From: Dag Haug Date: Sun, 4 Feb 2024 20:15:37 +0100 Subject: [PATCH 2/4] Some work on empty tokens --- lib/proiel/cli/converters/conll-u.rb | 46 ++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/lib/proiel/cli/converters/conll-u.rb b/lib/proiel/cli/converters/conll-u.rb index 92fd6d4..202f042 100644 --- a/lib/proiel/cli/converters/conll-u.rb +++ b/lib/proiel/cli/converters/conll-u.rb @@ -51,8 +51,9 @@ def initialize(sentence) # keep track of how many new tokens have been created offset = 0 - sentence.tokens.reject { |t| t.empty_token_sort == 'P' }.each do |tk| + sentence.tokens.each do |tk| + # deal with tokens with space if tk.form =~ /[[:space:]]/ subtoks = tk.form.split(/[[:space:]]/) @@ -84,9 +85,20 @@ def initialize(sentence) end end + # here we need to do some tricks with empty tokens + # empty C and V tokens always get high token numbers, so nothing to worry about + # but empty P tokens are placed around their verbs + # initialize token numbering at 1 + i = 1 + tks.each do |tk| + if tk.empty_token_sort != "P" + id_to_number[tk.id] = i.to_s + i += 1 + else + p_number = tk.head.dependents.select { |d| d.empty_token_sort == "P" }.find_index { |p| p == tk } - tks.map(&:id).each_with_index.each do |id, i| - id_to_number[id] = i + 1 + id_to_number[tk.id] = "#{i.to_s}.#{(p_number + 1).to_s}" + end end @tokens = tks.map do |t| @@ -94,8 +106,8 @@ def initialize(sentence) 
Token.new(id_to_number[t.id], id_to_number[t.head_id], #insert dots in any whitespace inside words and lemmata - t.form.to_s.gsub(/[[:space:]]/, '.'), - t.lemma.to_s.gsub(/[[:space:]]/, '.'), + (t.form.to_s == '' ? '_' : t.form.to_s.gsub(/[[:space:]]/, '.')), + (t.lemma.to_s == '' ? '_' : t.lemma.to_s.gsub(/[[:space:]]/, '.')), t.part_of_speech, t.language, t.morphology, @@ -104,7 +116,7 @@ def initialize(sentence) t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] }, t.citation_part, t.id, - t.information_status, + t.information_status, # TODO we may want to get rid of the info_unannotatable tag here t.antecedent_id, self ) @@ -133,6 +145,10 @@ def distribute_conjunctions! end end + # TODO: to deal with empty tokens, we must use a string as index, not a number. This breaks some of these tests. Either + # 1) convert back to number here before we run the test + # 2) stay with id as a number, give empty p-tokens a sub_id attribute, and compose this at the end to give the id to use in the conll file + # 2) is probably best, but it may mess up with finding the heads of tokens? Because @id will not be unique def check_directionality! @tokens.select { |t| ['fixed', 'flat:foreign', 'flat:name'].include? t.relation }.each do |f| f.promote!(nil, f.relation) if f.id < f.head.id @@ -212,7 +228,6 @@ def map_part_of_speech! end def restructure_graph! - @tokens.delete_if { |n| n.empty_token_sort == 'P' } @tokens.select(&:preposition?).each(&:process_preposition!) @tokens.select { |t| t.comparison_word? and t.dependents and t.dependents.select { |d| ['sub','obj','obl','comp','adv'].include?(d.relation) }.any? }.each(&:process_comparison!) roots.each(&:change_coordinations!) @@ -501,13 +516,13 @@ def miscellaneous def to_conll [@id, @form, - @baselemma.gsub(/не\./,''), + @baselemma.to_s.gsub(/не\./,''), @upos, - @part_of_speech, + (@part_of_speech || '_'), format_features(@features), @head_id, (@head_id == 0 ? 
'root' : @relation), # override non-root relations on root until we've found out how to handle unembedded reports etc - '_', # slashes here + "_", #"#{@head_id}:#{@relation}", # slashes will eventually go here miscellaneous].join("\t") end @@ -567,10 +582,16 @@ def find_relation possible_relations end end + # TODO: we want empty P tokens to have part of speech PRON and then head:nsubj/obj in the DEPS column and MISC as the others + def map_part_of_speech! dependents.each(&:map_part_of_speech!) possible_postags = POS_MAP[@part_of_speech] - find_postag possible_postags.dup + if @empty_token_sort == "P" + @upos = "PRON" + else + find_postag possible_postags.dup + end # ugly, but the ugliness comes from UDEP @upos = 'PRON' if @upos == 'DET' and @relation != 'det' @upos = REL_TO_POS[@relation] if @upos == 'X' @@ -699,9 +720,10 @@ def process_preposition! mods.each { |m| m.head_id = obliques.first.id } end + # TODO: this removes all empty nodes, irrespective of type. Does not work when we want to keep P type nodes def remove_empties! dependents.each(&:remove_empties!) - if is_empty? + if is_empty? and empty_token_sort != "P" dependents.each { |d| d.head_id = head_id; d.relation = 'remnant' } @sentence.remove_token! 
self end From bd18805b4d3da7c6a09a09cf53206b0274230b59 Mon Sep 17 00:00:00 2001 From: Dag Haug Date: Mon, 5 Feb 2024 21:21:50 +0100 Subject: [PATCH 3/4] Finished work on empty tokens --- lib/proiel/cli/converters/conll-u.rb | 36 ++++++++++++++++------------ 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/lib/proiel/cli/converters/conll-u.rb b/lib/proiel/cli/converters/conll-u.rb index 202f042..dbdd733 100644 --- a/lib/proiel/cli/converters/conll-u.rb +++ b/lib/proiel/cli/converters/conll-u.rb @@ -95,9 +95,13 @@ def initialize(sentence) id_to_number[tk.id] = i.to_s i += 1 else - p_number = tk.head.dependents.select { |d| d.empty_token_sort == "P" }.find_index { |p| p == tk } - - id_to_number[tk.id] = "#{i.to_s}.#{(p_number + 1).to_s}" + p_number = tk.head.dependents.select { |d| d.empty_token_sort == "P" }.find_index { |p| p == tk } + 1 + # if we have a subject, it will come before the verb and + # the counter is not yet incremented. Objects and obliques + # will be placed after their verb, so the counter will + # have been incremented and we must substract one. + tk.relation == "sub" ? h_number = i : h_number = i -1 + id_to_number[tk.id] = "#{h_number.to_s}.#{(p_number).to_s}" end end @@ -116,11 +120,12 @@ def initialize(sentence) t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] }, t.citation_part, t.id, - t.information_status, # TODO we may want to get rid of the info_unannotatable tag here + (t.information_status == "info_unannotatable" ? nil : t.information_status), t.antecedent_id, self ) - end + end.sort_by { |tk| [tk.id.split(".")[0].to_i, tk.id] } + # sort so that empty tokens come after their mother node end def convert @@ -137,8 +142,8 @@ def distribute_conjunctions! 
conjuncts = h.dependents.select { |d| d.relation == 'conj' } conjunctions = h.dependents.select { |d| d.relation == 'cc' } conjunctions.each do |c| - if c.id > h.id - new_head = conjuncts.select { |cj| cj.id > c.id }.first + if c.id.split(".")[0].to_i > h.id.split(".")[0].to_i + new_head = conjuncts.select { |cj| cj.id.split(".")[0].to_i > c.id.split(".")[0].to_i }.first c.head_id = new_head.id if new_head end end @@ -151,10 +156,10 @@ def distribute_conjunctions! # 2) is probably best, but it may mess up with finding the heads of tokens? Because @id will not be unique def check_directionality! @tokens.select { |t| ['fixed', 'flat:foreign', 'flat:name'].include? t.relation }.each do |f| - f.promote!(nil, f.relation) if f.id < f.head.id + f.promote!(nil, f.relation) if f.id.split(".")[0].to_i < f.head.id.split(".")[0].to_i end @tokens.select { |t| t.relation == 'conj' }.each do |f| - raise "conj must go left-to-right" if f.id < f.head.id + raise "conj must go left-to-right but dependent #{f.id} is to the left of head #{f.head.id}" if f.id.split(".")[0].to_i < f.head.id.split(".")[0].to_i end end @@ -235,9 +240,9 @@ def restructure_graph! demote_subjunctions! prune_empty_rootnodes! # do ellipses from left to right for proper remnant treatment - @tokens.select(&:ellipsis?).sort_by { |e| e.left_corner.id }.each(&:process_ellipsis!) + @tokens.select(&:ellipsis?).sort_by { |e| e.left_corner.id.split(".")[0].to_i }.each(&:process_ellipsis!) #NB! apos gets overridden by process_comparison so some dislocations are lost - @tokens.select { |t| t.relation == 'apos' and t.id < t.head_id }.each(&:process_dislocation!) + @tokens.select { |t| t.relation == 'apos' and t.id.split(".")[0].to_i < t.head_id.split(".")[0].to_i }.each(&:process_dislocation!) # DIRTY: remove the rest of the empty nodes by attaching them # to their grandmother with remnant. 
This is the best way to # do it given the current state of the UDEP scheme, but @@ -520,9 +525,10 @@ def to_conll @upos, (@part_of_speech || '_'), format_features(@features), - @head_id, - (@head_id == 0 ? 'root' : @relation), # override non-root relations on root until we've found out how to handle unembedded reports etc - "_", #"#{@head_id}:#{@relation}", # slashes will eventually go here + (@empty_token_sort == "P" ? "_" : @head_id), + (@empty_token_sort == "P" ? "_" : + (@head_id == 0 ? 'root' : @relation)), # override non-root relations on root until we've found out how to handle unembedded reports etc + "#{@head_id}:#{@relation}", # slashes will eventually go here miscellaneous].join("\t") end @@ -739,7 +745,7 @@ def process_coordination! raise 'Only coordinations can be processed this way!' unless conjunction? return if dependents.reject { |d| d.relation == 'aux' }.empty? distribute_shared_modifiers! - dependents.reject { |d| d.relation == 'aux' }.sort_by { |d| d.left_corner.id }.first.promote!('conj', 'cc') + dependents.reject { |d| d.relation == 'aux' }.sort_by { |d| d.left_corner.id.split(".")[0].to_i }.first.promote!('conj', 'cc') end def distribute_shared_modifiers! 
From 685651f07d21796346838aa7078f2caea535c320 Mon Sep 17 00:00:00 2001 From: Dag Haug Date: Sat, 10 Feb 2024 16:48:12 +0100 Subject: [PATCH 4/4] Fixed empty OBL numbering --- lib/proiel/cli/converters/conll-u.rb | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/lib/proiel/cli/converters/conll-u.rb b/lib/proiel/cli/converters/conll-u.rb index dbdd733..997e393 100644 --- a/lib/proiel/cli/converters/conll-u.rb +++ b/lib/proiel/cli/converters/conll-u.rb @@ -15,7 +15,11 @@ def process(tb, options = []) source.divs.each do |div| div.sentences.each do |sentence| sentence_count += 1 - n = Sentence.new sentence + begin + n = Sentence.new sentence + rescue => e + STDERR.puts "Cannot initialize #{sentence.id} (#{sentence.citation}): #{e}" + end begin # Do the conversion first to avoid spurious headers if the conversion fails a = n.convert.to_conll @@ -95,12 +99,27 @@ def initialize(sentence) id_to_number[tk.id] = i.to_s i += 1 else - p_number = tk.head.dependents.select { |d| d.empty_token_sort == "P" }.find_index { |p| p == tk } + 1 + p_number = tk.head.dependents.select(&:pro?).find_index { |p| p == tk } + 1 # if we have a subject, it will come before the verb and # the counter is not yet incremented. Objects and obliques # will be placed after their verb, so the counter will # have been incremented and we must substract one. - tk.relation == "sub" ? h_number = i : h_number = i -1 + + # For obliques, there is the added complication that there + # could be an overt object, potentially dominating + # other tokens, which could intervene between the verb and + # the pro-drop oblique, causing the counter to be + # incremented.
+ + + if tk.relation == "sub" + h_number = i + elsif tk.relation == "obj" + h_number = i - 1 + else + intervening_tokens = tks[(tks.find_index { |h| h.id == tk.head_id } + 1)..(tks.index(tk) - 1)].reject(&:pro?).length + h_number = i - 1 - intervening_tokens + end id_to_number[tk.id] = "#{h_number.to_s}.#{(p_number).to_s}" end end