#!/usr/bin/env python3
"""
Modify a sequence of phonemes based on some rules like the ones in
the Great Vowel Shift
"""

import collections

CMUDICT_FILENAME = "cmudict-basic.txt"
BABYNAME_FILENAME = "../babynames/yob1984.txt"  # why 1984? why not?

# Placeholders; both are replaced by the result of read_phonemes() below.
CATEGORIES = {}
TYPES = {}


def read_phonemes():
    """Read the phoneme inventory from "cmudict/cmudict.phones".

    Each non-blank line must hold exactly two whitespace-separated
    fields: a phoneme and its category.

    Returns:
        (types, categories) where types[phoneme] is 'vowel' etc. and
        categories['vowel'] is the list of phonemes in that category,
        in file order.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    types, categories = {}, {}
    with open("cmudict/cmudict.phones", 'r') as phones:
        for line in phones:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines instead of crashing on unpack
            ph, category = fields
            types[ph] = category
            categories.setdefault(category, []).append(ph)
    return types, categories


TYPES, CATEGORIES = read_phonemes()


def read_cmudict():
    """Make dictionary {word: [phoneme, ...]} from CMUDICT_FILENAME.

    The first whitespace-separated field of each line is the word, the
    remaining fields are its phonemes.  If a word occurs on several
    lines, the last one wins.
    """
    cmudict = {}
    with open(CMUDICT_FILENAME, 'r') as entries:
        for line in entries:
            words = line.split()
            if not words:
                continue  # tolerate blank lines instead of IndexError
            cmudict[words[0]] = words[1:]
    return cmudict


CMUDICT = read_cmudict()


def read_babynames():
    """Make list [name, ...] from BABYNAME_FILENAME.

    The name is the text before the first comma on each line,
    lowercased.
    """
    with open(BABYNAME_FILENAME, 'r') as rows:
        return [line.split(',', 1)[0].lower() for line in rows]


NAMES = read_babynames()


def segment_into_vowel_consonant_subsequences(phonemes):
    """Split [phoneme, ...] into groups of consonants followed by vowels.

    Returns a list [(i, j, k), ...] where phonemes[i:j] are consonants
    (anything whose TYPES entry is not 'vowel') and phonemes[j:k] are
    vowels.  Either slice of a group may be empty.  k from one group ==
    i from the next group, for convenience.

    Raises:
        KeyError: if a phoneme is not present in TYPES.
    """
    groups = []
    cur = 0
    while cur < len(phonemes):
        consonant = cur
        while cur < len(phonemes) and TYPES[phonemes[cur]] != 'vowel':
            cur += 1
        vowel = cur
        while cur < len(phonemes) and TYPES[phonemes[cur]] == 'vowel':
            cur += 1
        groups.append((consonant, vowel, cur))
    return groups


def common_sequences():
    """Count consonant and vowel sequences over every word in CMUDICT.

    Returns:
        (C, V): two collections.Counter objects keyed by tuples of
        phonemes.  Empty runs are counted too, under the key ().
    """
    C = collections.Counter()
    V = collections.Counter()
    for phonemes in CMUDICT.values():
        for i, j, k in segment_into_vowel_consonant_subsequences(phonemes):
            C[tuple(phonemes[i:j])] += 1
            V[tuple(phonemes[j:k])] += 1
    return C, V


def remove(phonemes, remove):
    """Return phonemes without the unwanted phoneme(s).

    `remove` is either a single phoneme (a string) or a collection of
    phonemes.  A string is treated as one whole phoneme: a plain `in`
    test against the string would be a substring test, so removing
    'SH' would also drop 'S'.
    """
    unwanted = {remove} if isinstance(remove, str) else set(remove)
    return [ph for ph in phonemes if ph not in unwanted]


def subst(phonemes, mapping):
    """Rewrite phonemes left to right according to mapping.

    mapping keys are tuples of phonemes or a single string phoneme;
    values are sequences of phonemes or a single string phoneme.  At
    each position the first key (in mapping order) that matches the
    upcoming phonemes is replaced by its value; when nothing matches,
    the phoneme is copied unchanged.  Replaced output is never
    rescanned.

    Returns a new list; `phonemes` is not modified.
    """
    rules = []
    for before, after in mapping.items():
        pattern = [before] if isinstance(before, str) else list(before)
        if not pattern:
            # An empty pattern matches everywhere without consuming
            # anything, which would never terminate.
            continue
        replacement = [after] if isinstance(after, str) else list(after)
        rules.append((pattern, replacement))

    source = list(phonemes)  # also lets tuples of phonemes match the rules
    output = []
    pos = 0
    while pos < len(source):
        for pattern, replacement in rules:
            if source[pos:pos + len(pattern)] == pattern:
                output.extend(replacement)
                pos += len(pattern)
                break
        else:
            output.append(source[pos])
            pos += 1
    return output


def print_replacements(name, ph_from, category):
    """Print each variant of name with ph_from swapped within category.

    name is a space-separated phoneme string.  A blank line is printed
    first; then, for every phoneme in CATEGORIES[category], ph_from is
    substituted by it and the result is printed if it differs from the
    original.
    """
    phonemes = name.split()
    print()
    for ph_to in CATEGORIES[category]:
        replacement = subst(phonemes, {ph_from: ph_to})
        if replacement != phonemes:
            print(' '.join(replacement))


def main():
    """Print single-phoneme substitution experiments for a few names."""
    print_replacements("M AY K AH L", 'AY', 'vowel')
    print_replacements("M AY K AH L", 'K', 'stop')
    print_replacements("M AY K AH L", 'M', 'nasal')
    print_replacements("M AY K AH L", 'M', 'fricative')
    print_replacements("K R IH S T AH F ER", 'IH', 'vowel')
    print_replacements("K R IH S T AH F ER", 'K', 'stop')
    print_replacements("K R IH S T AH F ER", 'K', 'fricative')
    print_replacements("K R IH S T AH F ER", ('K', 'R'), 'fricative')
    print_replacements("S T EH F AH N IY", 'EH', 'vowel')
    print_replacements("S T EH F AH N IY", 'S', 'stop')
    print_replacements("S T EH F AH N IY", 'S', 'fricative')
    print_replacements("S T EH F AH N IY", ('S', 'T'), 'fricative')
    print_replacements("JH EH N AH F ER", 'EH', 'vowel')
    print_replacements("JH EH N AH F ER", 'N', 'nasal')
    print_replacements("JH EH N AH F ER", 'N', 'fricative')
    print_replacements("JH EH N AH F ER", 'JH', 'fricative')
    print_replacements("JH EH N AH F ER", 'F', 'fricative')
    print_replacements("D AE N Y AH L", 'AE', 'vowel')
    print_replacements("D AE N Y AH L", ('Y', 'AH'), 'vowel')
    print_replacements("D AE N Y AH L", 'D', 'stop')
    print_replacements("D AE N Y AH L", 'D', 'fricative')
    print_replacements("D AE N Y AH L", 'L', 'liquid')
    print_replacements("L OW G AH N", 'L', 'liquid')
    print_replacements("L OW G AH N", 'L', 'fricative')
    print_replacements("L OW G AH N", 'OW', 'vowel')
    print_replacements("V IH K IY", 'V', 'fricative')
    print_replacements("V IH K IY", 'V', 'stop')
    print_replacements("V IH K IY", 'V', 'nasal')
    print_replacements("V IH K IY", 'IH', 'vowel')
    print_replacements("V IH K IY", 'K', 'fricative')
    print_replacements("V IH K IY", 'K', 'stop')
    print_replacements("V IH K IY", 'K', 'nasal')
    print_replacements("V IH K IY", 'IY', 'vowel')
    print_replacements("R AY L IY", 'R', 'fricative')
    print_replacements("R AY L IY", 'AY', 'vowel')
    print_replacements("R AY L IY", 'L', 'fricative')
    print_replacements("R AY L IY", 'IY', 'vowel')
    print_replacements("EH V AH L IH N", "EH", 'vowel')
    print_replacements("EH V AH L IH N", "V", 'fricative')
    print_replacements("EH V AH L IH N", "AH", 'vowel')
    print_replacements("EH V AH L IH N", "L", 'fricative')
    print_replacements("EH V AH L IH N", "IH", 'vowel')
    print_replacements("EH V AH L IH N", "N", 'fricative')
    print()

    # https://en.wikipedia.org/wiki/Great_Vowel_Shift
    # print(' '.join(subst(, {
    #     'AY': 'IY',
    #     'IY': 'EH',
    #     'EY': 'AA',
    #     'AW': 'UW',
    #     'UW': 'OW',
    #     'OW': 'AO',
    # })))


# The most frequent non-empty vowel and consonant runs become two extra
# categories; their members are tuples of phonemes, not single strings.
C, V = common_sequences()
CATEGORIES['top-vowel'] = [x for x, y in V.most_common(50) if x]
CATEGORIES['top-consonant'] = [x for x, y in C.most_common(60) if x]
# print('#', ' '.join(['+'.join(x) for x in CATEGORIES['top-vowel']]))
# print('#', ' '.join(['+'.join(x) for x in CATEGORIES['top-consonant']]))

# AH IH IY EH AA ER AE OW EY AO AY UW AW IY+AH UH OY IY+OW ER+IH ER+AH
# N L K M T S D R B Z P V F G HH NG SH S+T W JH N+T N+D CH N+Z N+S T+S P+R K+S B+R

# NOTE:
# It might be better, instead of having a global set of vowel and
# consonant substitutions, to have the top 10 for a given previous-
# and -next phoneme, including beginning-of-string and end-of-string.
# That would capture things like Q often having a UW sound after it
# instead of AH being the most common vowel to put there.


def main2():
    """Write one file of candidate pronunciations per baby name.

    For every name in NAMES that has a CMUDICT entry whose
    pronunciation has not been handled yet, "_input/<name>.txt" is
    written with a '# name' header, the original pronunciation and
    then, for each consonant/vowel group, one variant per
    'top-consonant' replacing the group's consonants and one variant
    per 'top-vowel' replacing its vowels.  The "_input" directory must
    already exist.
    """
    seen_already = set()  # pronunciations (as tuples) already written
    for name in NAMES:
        if name not in CMUDICT:
            continue
        phonemes = CMUDICT[name]
        if tuple(phonemes) in seen_already:
            continue
        # print(name)
        seen_already.add(tuple(phonemes))
        with open("_input/" + name + ".txt", 'w') as output:
            groups = segment_into_vowel_consonant_subsequences(phonemes)
            print('#', name, file=output)
            print(' '.join(phonemes), file=output)
            for i, j, k in groups:
                print(file=output)
                for consonant in CATEGORIES['top-consonant']:
                    print(' '.join(phonemes[:i] + list(consonant) + phonemes[j:]),
                          file=output)
                print(file=output)
                for vowel in CATEGORIES['top-vowel']:
                    print(' '.join(phonemes[:j] + list(vowel) + phonemes[k:]),
                          file=output)


# Runs unconditionally, as in the original script.
main2()

# for name in _input/*.txt; do output="_output/$(basename $name)"; if [ ! $output -nt $name ] ; then echo $name; python3 phoneme2grapheme.py --model my-g2p-big --decode $name >$output ; fi ; done