diff --git a/data/data_generator.rb b/data/data_generator.rb index 293d0f5..84836b0 100644 --- a/data/data_generator.rb +++ b/data/data_generator.rb @@ -67,7 +67,7 @@ # authorization of the copyright holder. -$ignorable_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Default_Ignorable_Code_Point.*?# Total code points:/m] +$ignorable_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Derived Property: Default_Ignorable_Code_Point.*?# Total code points:/m] $ignorable = [] $ignorable_list.each_line do |entry| if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/ @@ -77,7 +77,7 @@ $ignorable_list.each_line do |entry| end end -$uppercase_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Uppercase.*?# Total code points:/m] +$uppercase_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Derived Property: Uppercase.*?# Total code points:/m] $uppercase = [] $uppercase_list.each_line do |entry| if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/ @@ -87,7 +87,7 @@ $uppercase_list.each_line do |entry| end end -$lowercase_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Lowercase.*?# Total code points:/m] +$lowercase_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Derived Property: Lowercase.*?# Total code points:/m] $lowercase = [] $lowercase_list.each_line do |entry| if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/ @@ -97,7 +97,7 @@ $lowercase_list.each_line do |entry| end end -$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt") +$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt", :encoding => 'utf-8') $grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER") $grapheme_boundclass_list.each_line do |entry| if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)/ @@ -107,7 +107,7 @@ $grapheme_boundclass_list.each_line do |entry| end end -$emoji_data_list = File.read("emoji-data.txt") +$emoji_data_list = File.read("emoji-data.txt", :encoding => 'utf-8') $emoji_data_list.each_line do |entry| if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Extended_Pictographic\W/ $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC" } @@ -120,7 +120,7 @@ $emoji_data_list.each_line do |entry| end end -$charwidth_list = File.read("CharWidths.txt") +$charwidth_list = File.read("CharWidths.txt", :encoding => 'utf-8') $charwidth = Hash.new(0) $charwidth_list.each_line do |entry| if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([0-9]+)/ @@ -130,13 +130,13 @@ $charwidth_list.each_line do |entry| end end -$exclusions = File.read("CompositionExclusions.txt")[/# \(1\) Script Specifics.*?# Total code points:/m] +$exclusions = File.read("CompositionExclusions.txt", :encoding => 'utf-8')[/# \(1\) Script Specifics.*?# Total code points:/m] $exclusions = $exclusions.chomp.split("\n").collect { |e| e.hex } -$excl_version = File.read("CompositionExclusions.txt")[/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m] +$excl_version = File.read("CompositionExclusions.txt", :encoding => 'utf-8')[/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m] $excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex } -$case_folding_string = File.open("CaseFolding.txt", :encoding => 'utf-8').read +$case_folding_string = File.read("CaseFolding.txt", :encoding => 'utf-8') $case_folding = {} $case_folding_string.chomp.split("\n").each do |line| next unless line =~ /([0-9A-F]+); [CF]; ([0-9A-F ]+);/i