shithub: zelda3

ref: 2c4de3dc497a89027aed78548436e64c3631499e
dir: /other/make_text_dict.py/

View raw version
import array

# Interning tables: `memos` maps a token string to its integer id, and
# `memoslist` maps the id back to the string (the id is the list index).
memos = {}
memoslist = []
def memo(s):
  """Return the integer id for token string `s`, assigning a new one if needed.

  Ids are handed out in first-seen order starting at 0, so the id of a
  token is always its position in `memoslist`.
  """
  try:
    return memos[s]
  except KeyError:
    pass
  # First time this token is seen: its id is the next free list index.
  token_id = len(memoslist)
  memoslist.append(s)
  memos[s] = token_id
  return token_id

def tos(s):
  """Turn a sequence of token ids back into the text they stand for."""
  pieces = [memoslist[token_id] for token_id in s]
  return "".join(pieces)

# Tokenize dialogue.txt into `lines`: one list of token ids per input line.
# A token is either a single character or a whole bracketed "[...]" command.
lines = []
# Read the whole file up front and close it right away.
with open('dialogue.txt', 'r') as dialogue_file:
  dialogue_rows = dialogue_file.read().splitlines()

for line in dialogue_rows:
  # Drop everything up to the first ': '. Splitting only once keeps any
  # later ': ' that is part of the dialogue text itself; an unbounded split
  # would silently truncate the text at that point.
  line = line.split(': ', 1)[1]

  # array('H') holds unsigned 16-bit values, so an id that does not fit
  # raises OverflowError instead of being stored.
  r = array.array('H')

  i = 0
  while i < len(line):
    if line[i] == '[':
      # A bracketed command is one token, brackets included.
      # str.index raises ValueError when the closing ']' is missing.
      j = line.index(']', i + 1)
      r.append(memo(line[i:j+1]))
      i = j + 1
    else:
      r.append(memo(line[i]))
      i += 1

  # Store a plain list so the later passes can splice it in place.
  lines.append(list(r))
import collections


def find_all_ngrams(lines, N, cost):
  """Pick the most frequent repeated N-gram and score replacing it.

  Every window of N consecutive tokens in every line is counted, except
  windows whose first two tokens are equal. Among the N-grams seen at
  least twice, the one with the highest count wins; ties go to the
  N-gram that compares largest as a tuple.

  Returns (ngram, score), where ngram is a tuple of tokens and score is
  (N - cost) * count - N - 2. Returns (None, 0) when no N-gram repeats.
  """
  counts = collections.Counter(
    tuple(tokens[start:start + N])
    for tokens in lines
    for start in range(len(tokens) - N + 1)
    if tokens[start] != tokens[start + 1]
  )
  # (count, ngram) pairs, so max() ranks by count first and by ngram second.
  repeated = [(count, gram) for gram, count in counts.items() if count >= 2]
  if not repeated:
    return None, 0
  count, gram = max(repeated)
  # 2 is the overhead of the dict
  score = (N - cost) * count - N - 2
  return gram, score

def find_best_ngram(cost):
  """Search n-gram lengths 2 through 31 for the replacement with the best score.

  `cost` is passed through to find_all_ngrams for every length. The search
  runs over the module-level `lines`.

  Returns (best_score, best_text). When no length yields a score above 0
  the result is (0, None), so callers can test best_score == 0 to stop.
  """
  best_score = 0
  # Must be bound before the loop: if no candidate ever beats 0, the return
  # below would otherwise raise UnboundLocalError.
  best_text = None

  for n in range(2, 32):
    text, score = find_all_ngrams(lines, n, cost)
    # Strict comparison: on equal scores the shorter n-gram found first wins.
    if score > best_score:
      best_score = score
      best_text = text
  return best_score, best_text

def update_ngrams(lines, replace_from, replace_to):
  """Replace each occurrence of `replace_from` in every line, in place.

  `replace_from` must be a tuple, because each window is compared as a
  tuple. `replace_to` is the list of tokens spliced in instead. Lines are
  scanned left to right; the scan positions are fixed from the line's
  length before any replacement is made. Returns None.
  """
  width = len(replace_from)
  for tokens in lines:
    last_start = len(tokens) - width
    for start in range(last_start + 1):
      stop = start + width
      if tuple(tokens[start:stop]) == replace_from:
        tokens[start:stop] = replace_to

# Running sum of the scores of all replacements accepted so far.
total_gain = 0

# Token count over all lines before any replacement; used in progress output.
original_tokens = sum(map(len, lines))


# Table of 97 short strings: runs of spaces plus common English fragments
# and words. Nothing in the visible part of this script reads it.
# NOTE(review): presumably the text dictionary used by the US version of the
# game, kept here for comparison with the generated one — confirm.
kTextDictionary_US = [
'    ', '   ', '  ', "'s ", 'and ', 
'are ', 'all ', 'ain', 'and', 'at ', 
'ast', 'an', 'at', 'ble', 'ba', 
'be', 'bo', 'can ', 'che', 'com', 
'ck', 'des', 'di', 'do', 'en ', 
'er ', 'ear', 'ent', 'ed ', 'en', 
'er', 'ev', 'for', 'fro', 'give ', 
'get', 'go', 'have', 'has', 'her', 
'hi', 'ha', 'ight ', 'ing ', 'in', 
'is', 'it', 'just', 'know', 'ly ', 
'la', 'lo', 'man', 'ma', 'me', 
'mu', "n't ", 'non', 'not', 'open', 
'ound', 'out ', 'of', 'on', 'or', 
'per', 'ple', 'pow', 'pro', 're ', 
're', 'some', 'se', 'sh', 'so', 
'st', 'ter ', 'thin', 'ter', 'tha', 
'the', 'thi', 'to', 'tr', 'up', 
'ver', 'with', 'wa', 'we', 'wh', 
'wi', 'you', 'Her', 'Tha', 'The', 
'Thi', 'You', 
]


# Greedy dictionary construction: repeatedly take the best-scoring n-gram,
# record it, and collapse every occurrence in `lines` into one new token.
dictionary = []

for slot in range(111 + 256):
  # The first 111 picks are scored with cost 1, all later ones with cost 2.
  # NOTE(review): presumably one-byte vs. two-byte dictionary codes — confirm.
  cost = 2 if slot >= 111 else 1
  best_score, best_text = find_best_ngram(cost)
  if best_score == 0:
    # Nothing left that is worth replacing.
    break

  total_gain = total_gain + best_score

  # NOTE(review): the message says "bigram", but the pick can be any length
  # from 2 to 31 tokens.
  phrase = tos(best_text)
  print(f'Removed best bigram "{phrase}" with gain {best_score}, total gain {total_gain} / {original_tokens}')

  dictionary.append(best_text)

  # The replacement token's text is the phrase wrapped in braces, so nested
  # dictionary entries stay visible when lines are printed.
  update_ngrams(lines, best_text, [memo('{%s}' % phrase)])

#print('kTextDictionary_NEW = [')
#for i, d in enumerate(dictionary):
#  repl = tos(d).replace('{', '').replace('}', '')
#  print(f'{repr(repl)},')
#print(']')


# Print every line after compression, prefixed with its index; dictionary
# replacements show up as brace-wrapped text.
for index, tokens in enumerate(lines):
  print(index, tos(tokens))