#!/usr/bin/env python3

"""
Modify a sequence of phonemes based on some rules
like the ones in the Great Vowel Shift
"""

import collections

# Pronunciation dictionary loaded by read_cmudict(): each line is a word
# followed by its whitespace-separated phonemes.
CMUDICT_FILENAME = "cmudict-basic.txt"
# Name list loaded by read_babynames(): only the text before the first comma
# on each line is used.
BABYNAME_FILENAME = "../babynames/yob1984.txt" # why 1984? why not?

# Placeholders only: both names are rebound to the tables returned by
# read_phonemes() right after that function is defined.
CATEGORIES = {}
TYPES = {}
def read_phonemes(filename="cmudict/cmudict.phones"):
    """Load the phoneme table: types[phoneme] is 'vowel' etc.

    Each non-blank line of the file must hold exactly two
    whitespace-separated fields: a phoneme symbol and its category.

    Args:
        filename: path of the phoneme table. The default is the path this
            function has always read, so existing callers are unaffected.

    Returns:
        A pair (types, categories): types[phoneme] is the phoneme's
        category, and categories[category] is the list of phonemes in that
        category, in file order.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    types, categories = {}, {}
    # Use a context manager so the handle is closed as soon as reading ends
    # instead of whenever the garbage collector gets to it.
    with open(filename, 'r') as phones_file:
        for line in phones_file:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            ph, category = fields
            types[ph] = category
            categories.setdefault(category, []).append(ph)
    return types, categories
# Load the phoneme tables once, at import time.
TYPES, CATEGORIES = read_phonemes()

def read_cmudict(filename=None):
    """Make dictionary {word: [phoneme, ...]} from a pronunciation file.

    Each non-blank line is split on whitespace; the first field is the word
    and the remaining fields are its phonemes. If a word appears on more
    than one line, the last occurrence wins.

    Args:
        filename: path of the dictionary file. When omitted (or None), the
            module-level CMUDICT_FILENAME is used, as before.

    Returns:
        dict mapping each word to its list of phoneme strings.
    """
    if filename is None:
        filename = CMUDICT_FILENAME
    cmudict = {}
    # Context manager guarantees the file is closed once it has been read.
    with open(filename, 'r') as dict_file:
        for line in dict_file:
            words = line.split()
            if not words:
                # A blank line has no word field; skip it rather than
                # failing on words[0].
                continue
            cmudict[words[0]] = words[1:]
    return cmudict
# Load the pronunciation dictionary once, at import time.
CMUDICT = read_cmudict()


def read_babynames(filename=None):
    """Make list [name, ...] of lower-cased names from a name file.

    Only the text before the first comma on each line is used. It is
    stripped of surrounding whitespace (so a line without a comma does not
    keep its trailing newline) and lower-cased. Lines whose name field is
    empty, including blank lines, are skipped.

    Args:
        filename: path of the name file. When omitted (or None), the
            module-level BABYNAME_FILENAME is used, as before.

    Returns:
        list of names in file order; duplicates are kept.
    """
    if filename is None:
        filename = BABYNAME_FILENAME
    names = []
    # Context manager guarantees the file is closed once it has been read.
    with open(filename, 'r') as names_file:
        for line in names_file:
            name = line.split(',', 1)[0].strip().lower()
            if name:
                names.append(name)
    return names
# Load the name list once, at import time.
NAMES = read_babynames()


def segment_into_vowel_consonant_subsequences(phonemes):
    """Split [phoneme, ...] into consecutive consonant-run/vowel-run groups.

    Returns a list of (i, j, k) index triples such that phonemes[i:j] is a
    run of non-vowels (possibly empty) and phonemes[j:k] is the run of
    vowels that follows it (possibly empty at the end of the word). The k
    of one triple is the i of the next, so the triples tile the sequence.
    A phoneme counts as a vowel when TYPES[phoneme] == 'vowel'.
    """
    total = len(phonemes)

    def end_of_run(start, want_vowel):
        # Advance from `start` while phonemes keep the requested vowel-ness.
        pos = start
        while pos < total and (TYPES[phonemes[pos]] == 'vowel') == want_vowel:
            pos += 1
        return pos

    spans = []
    start = 0
    while start < total:
        vowel_start = end_of_run(start, False)
        vowel_end = end_of_run(vowel_start, True)
        spans.append((start, vowel_start, vowel_end))
        start = vowel_end
    return spans


def common_sequences():
    """Count consonant and vowel sequences over every CMUDICT pronunciation.

    Returns a pair of Counters (consonants, vowels) keyed by tuples of
    phonemes. Empty runs are counted too, under the empty tuple.
    """
    consonant_counts = collections.Counter()
    vowel_counts = collections.Counter()
    for pronunciation in CMUDICT.values():
        spans = segment_into_vowel_consonant_subsequences(pronunciation)
        consonant_counts.update(
            tuple(pronunciation[start:mid]) for start, mid, _ in spans)
        vowel_counts.update(
            tuple(pronunciation[mid:end]) for _, mid, end in spans)
    return consonant_counts, vowel_counts

    
def remove(phonemes, remove):
    """Return a copy of phonemes without the phoneme(s) given by `remove`.

    Args:
        phonemes: iterable of phoneme strings; not modified.
        remove: either a single phoneme string or a collection of phonemes.

    Returns:
        A new list with every matching phoneme dropped.
    """
    # A bare string must be compared as one whole phoneme. Using `in` on the
    # string itself is a substring test, so removing 'NG' would also drop
    # 'N' and 'G'.
    targets = (remove,) if isinstance(remove, str) else remove
    return [ph for ph in phonemes if ph not in targets]


def subst(phonemes, mapping):
    """Rewrite a phoneme sequence by replacing matched sub-sequences.

    The sequence is scanned left to right. At each position the mapping's
    entries are tried in iteration order and the first one whose key matches
    the phonemes starting there is applied. Scanning then resumes after the
    matched span, so replacement output is never itself re-scanned.
    Phonemes that match no key are copied through unchanged.

    Args:
        phonemes: sequence of phoneme strings; not modified.
        mapping: keys are a single phoneme string or a tuple/list of
            phonemes to match; values are a single phoneme string or a
            tuple/list of phonemes to emit. Empty keys are ignored, since
            they would match everywhere without consuming anything.

    Returns:
        A new list of phonemes.
    """
    source = list(phonemes)

    # Normalise the keys once up front rather than on every scan position.
    rules = []
    for before, after in mapping.items():
        pattern = [before] if isinstance(before, str) else list(before)
        if pattern:
            rules.append((pattern, after))

    output = []
    pos = 0
    # Walk with an index instead of deleting from the front of the list,
    # which costs O(n) per deletion.
    while pos < len(source):
        for pattern, after in rules:
            if source[pos:pos + len(pattern)] == pattern:
                if isinstance(after, str):
                    output.append(after)
                else:
                    output.extend(after)
                pos += len(pattern)
                break
        else:
            # No rule matched here: keep the phoneme as it is.
            output.append(source[pos])
            pos += 1
    return output


def print_replacements(name, ph_from, category):
    """Print variants of a pronunciation with ph_from swapped out.

    `name` is a space-separated phoneme string. A blank line is printed
    first; then, for each phoneme in CATEGORIES[category], ph_from is
    substituted by it and the result is printed, unless the substitution
    left the pronunciation unchanged.
    """
    original = name.split()
    print()
    candidates = (
        subst(original, {ph_from: ph_to}) for ph_to in CATEGORIES[category]
    )
    for candidate in candidates:
        if candidate == original:
            continue
        print(' '.join(candidate))

def main():
    """Print single-substitution variants for a fixed set of sample names.

    Each sample pronunciation is paired with a list of
    (phoneme(s) to replace, replacement category) experiments. Every pair
    is passed to print_replacements() in order, and a final blank line is
    printed at the end.
    """
    experiments = (
        ("M AY K AH L", (
            ('AY', 'vowel'),
            ('K', 'stop'),
            ('M', 'nasal'),
            ('M', 'fricative'),
        )),
        ("K R IH S T AH F ER", (
            ('IH', 'vowel'),
            ('K', 'stop'),
            ('K', 'fricative'),
            (('K', 'R'), 'fricative'),
        )),
        ("S T EH F AH N IY", (
            ('EH', 'vowel'),
            ('S', 'stop'),
            ('S', 'fricative'),
            (('S', 'T'), 'fricative'),
        )),
        ("JH EH N AH F ER", (
            ('EH', 'vowel'),
            ('N', 'nasal'),
            ('N', 'fricative'),
            ('JH', 'fricative'),
            ('F', 'fricative'),
        )),
        ("D AE N Y AH L", (
            ('AE', 'vowel'),
            (('Y', 'AH'), 'vowel'),
            ('D', 'stop'),
            ('D', 'fricative'),
            ('L', 'liquid'),
        )),
        ("L OW G AH N", (
            ('L', 'liquid'),
            ('L', 'fricative'),
            ('OW', 'vowel'),
        )),
        ("V IH K IY", (
            ('V', 'fricative'),
            ('V', 'stop'),
            ('V', 'nasal'),
            ('IH', 'vowel'),
            ('K', 'fricative'),
            ('K', 'stop'),
            ('K', 'nasal'),
            ('IY', 'vowel'),
        )),
        ("R AY L IY", (
            ('R', 'fricative'),
            ('AY', 'vowel'),
            ('L', 'fricative'),
            ('IY', 'vowel'),
        )),
        ("EH V AH L IH N", (
            ("EH", 'vowel'),
            ("V", 'fricative'),
            ("AH", 'vowel'),
            ("L", 'fricative'),
            ("IH", 'vowel'),
            ("N", 'fricative'),
        )),
    )

    for pronunciation, swaps in experiments:
        for ph_from, category in swaps:
            print_replacements(pronunciation, ph_from, category)

    print()
    
# https://en.wikipedia.org/wiki/Great_Vowel_Shift
# print(' '.join(subst(, {
#     'AY': 'IY',
#     'IY': 'EH',
#     'EY': 'AA',
#     'AW': 'UW',
#     'UW': 'OW',
#     'OW': 'AO',
#     })))


# Count every consonant run and vowel run across the dictionary.
C, V = common_sequences()
# Register the most frequent runs as extra substitution categories. The
# empty tuple (an empty run) is dropped after the top-N cut, so each list
# may hold fewer than N entries.
CATEGORIES['top-vowel'] = [seq for seq, _ in V.most_common(50) if seq]
CATEGORIES['top-consonant'] = [seq for seq, _ in C.most_common(60) if seq]
# print('#', '  '.join(['+'.join(x) for x in CATEGORIES['top-vowel']]))
# print('#', '  '.join(['+'.join(x) for x in CATEGORIES['top-consonant']]))
# AH  IH  IY  EH  AA  ER  AE  OW  EY  AO  AY  UW  AW  IY+AH  UH  OY  IY+OW  ER+IH  ER+AH
# N  L  K  M  T  S  D  R  B  Z  P  V  F  G  HH  NG  SH  S+T  W  JH  N+T  N+D  CH  N+Z  N+S  T+S  P+R  K+S  B+R

# NOTE:
# Instead of one global set of vowel and consonant substitutions, it
# might be better to keep the top 10 for each (previous phoneme, next
# phoneme) context, counting beginning-of-string and end-of-string as
# contexts too. That would capture things like Q often being followed
# by a UW sound, instead of AH being offered there just because it is
# the most common vowel overall.

def main2():
    """Write one substitution-candidate file per distinct name pronunciation.

    For every name in NAMES that has a CMUDICT entry, and whose
    pronunciation has not already been handled under another name, this
    writes "_input/<name>.txt". The file holds a '# <name>' header and the
    original pronunciation. Then, for each consonant-run/vowel-run group,
    it holds a blank line, one line per 'top-consonant' sequence put in
    place of the consonant run, another blank line, and one line per
    'top-vowel' sequence put in place of the vowel run.
    """
    written = set()  # pronunciations (as tuples) already emitted
    for name in NAMES:
        if name not in CMUDICT:
            continue
        pronunciation = CMUDICT[name]
        key = tuple(pronunciation)
        if key in written:
            continue
        written.add(key)

        with open("_input/" + name + ".txt", 'w') as output:
            def emit(text=''):
                # One output line, newline-terminated.
                output.write(text + '\n')

            spans = segment_into_vowel_consonant_subsequences(pronunciation)
            emit('# ' + name)
            emit(' '.join(pronunciation))
            for cons_start, vowel_start, vowel_end in spans:
                emit()
                for cluster in CATEGORIES['top-consonant']:
                    variant = (pronunciation[:cons_start] + list(cluster)
                               + pronunciation[vowel_start:])
                    emit(' '.join(variant))
                emit()
                for cluster in CATEGORIES['top-vowel']:
                    variant = (pronunciation[:vowel_start] + list(cluster)
                               + pronunciation[vowel_end:])
                    emit(' '.join(variant))

# Runs unconditionally at module level, so importing this file also writes
# the _input/ files. Only main2() is invoked here; main() is not.
main2()

# for name in _input/*.txt; do output="_output/$(basename $name)"; if [ ! $output -nt $name ] ; then echo $name; python3 phoneme2grapheme.py --model my-g2p-big --decode $name >$output ; fi ; done