qsomap/itu_prefixes/preprocess.py

113 lines
3.0 KiB
Python
Executable File

#!/usr/bin/env python3
import sys
# Preprocess a tab-separated table of call sign series allocations into two
# compact lookup tables.
#
# Usage: preprocess.py TABLE.tsv
#
# Input:  a header line followed by rows of the form
#             <first> - <last> TAB <country>
#         e.g. "AAA - ALZ<TAB>Some Country".
#         NOTE(review): presumably the ITU call sign series table, judging by
#         the file's location -- confirm against the actual data file.
# Output: country2call.tsv (country -> comma-separated prefixes) and
#         call2country.tsv (prefix -> country), written to the current
#         working directory.

COUNTRY2CALL_FILENAME = "country2call.tsv"
CALL2COUNTRY_FILENAME = "call2country.tsv"


def load_countrymap(lines):
    """Group raw series ranges by country.

    Args:
        lines: iterable of text lines; the first one is treated as a header
            and skipped unconditionally.

    Returns:
        dict mapping each country name to the list of raw range strings
        (first column) in input order.

    Lines that do not consist of exactly two tab-separated fields are
    skipped; non-empty ones are reported on stderr so data problems do not
    go unnoticed.
    """
    countrymap = {}
    for lineno, line in enumerate(lines, start=1):
        if lineno == 1:
            continue  # header row
        stripped = line.strip()
        parts = stripped.split("\t")
        if len(parts) != 2:
            if stripped:
                print(f"skipping malformed line {lineno}: {stripped!r}",
                      file=sys.stderr)
            continue
        series, country = parts
        countrymap.setdefault(country, []).append(series)
        print(f"{country} => {series}")
    return countrymap


def _parse_range(prefixrange):
    """Split "AAA - ALZ" (spaces around the dash optional) into a tuple."""
    return tuple(part.strip() for part in prefixrange.split("-"))


def _series_span(rng):
    """Return (first, last) two-character series fully covered by *rng*.

    A range covers whole series when it runs from "..A" to "..Z" in its last
    character, e.g. ("AAA", "ALZ") covers the series "AA" through "AL".
    Returns None for partial ranges (e.g. ("3DA", "3DM")) and for anything
    that is not a well-formed pair of three-character prefixes.
    """
    if len(rng) != 2:
        return None
    start, end = rng
    if len(start) != 3 or len(end) != 3:
        return None
    if start[2] != "A" or end[2] != "Z":
        return None
    if start[0] != end[0] or start[1] > end[1]:
        return None
    return start[:2], end[:2]


def _format_run(run):
    """Collapse a run of contiguous series into its shortest representation."""
    first, last = run
    if first == last:
        return first
    if first[1] == "A" and last[1] == "Z":
        # The whole first letter is covered, e.g. KA..KZ -> "K".
        return first[0]
    return (first, last)


def minimize_prefixes(raw_prefixes):
    """Reduce a country's raw ranges to a minimal list of prefixes.

    Args:
        raw_prefixes: list of range strings such as "AAA - ALZ".

    Returns:
        A list, in sorted order of the input ranges, whose items are either
        * a str: a one-character ("K") or two-character ("AA") prefix, or
        * a tuple: an inclusive range, either of two-character series
          (("AA", "AL")) or an unmodified partial range (("3DA", "3DM")).

    Only series that are actually contiguous (or overlapping) are merged, so
    the result never covers a series that was not present in the input.
    Ranges that cannot be parsed as a pair are passed through unchanged.
    """
    minimized = []
    run = None  # (first_series, last_series) of the run being extended
    for rng in sorted(_parse_range(raw) for raw in raw_prefixes):
        span = _series_span(rng)
        if span is not None and run is not None:
            same_letter = span[0][0] == run[1][0]
            # Adjacent or overlapping in the second character.
            if same_letter and ord(span[0][1]) <= ord(run[1][1]) + 1:
                run = (run[0], max(run[1], span[1]))
                continue
        if run is not None:
            minimized.append(_format_run(run))
        if span is None:
            minimized.append(rng)
            run = None
        else:
            run = span
    if run is not None:
        minimized.append(_format_run(run))
    return minimized


def format_prefix(prefix):
    """Render a prefix entry: tuples become "first-last", strings pass through."""
    if isinstance(prefix, tuple):
        return "-".join(prefix)
    return prefix


def write_country2call(countrymap, filename=COUNTRY2CALL_FILENAME):
    """Write "country TAB prefix, prefix, ..." lines, sorted by country."""
    with open(filename, "w", encoding="utf-8") as outfile:
        for country in sorted(countrymap):
            formatted = ", ".join(format_prefix(p) for p in countrymap[country])
            outfile.write(country + "\t" + formatted + "\n")


def write_call2country(countrymap, filename=CALL2COUNTRY_FILENAME):
    """Write "prefix TAB country" lines, sorted by prefix.

    If the same formatted prefix appears for several countries, the country
    encountered last wins.
    """
    call2country = {}
    for country, prefixes in countrymap.items():
        for prefix in prefixes:
            call2country[format_prefix(prefix)] = country
    with open(filename, "w", encoding="utf-8") as outfile:
        for prefix in sorted(call2country):
            outfile.write(prefix + "\t" + call2country[prefix] + "\n")


def main(argv=None):
    """Run the preprocessing; returns a process exit status."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        program = argv[0] if argv else "preprocess.py"
        print(f"usage: {program} TABLE.tsv", file=sys.stderr)
        return 2

    with open(argv[1], "r", encoding="utf-8") as infile:
        raw_countrymap = load_countrymap(infile)

    countrymap = {}
    for country, raw_prefixes in raw_countrymap.items():
        minimized_prefixes = minimize_prefixes(raw_prefixes)
        countrymap[country] = minimized_prefixes
        print(f"{country:30s} => {minimized_prefixes}")

    write_country2call(countrymap)
    write_call2country(countrymap)
    return 0


if __name__ == "__main__":
    sys.exit(main())