HKY+IGC free tau¶
An example of a molecular model of evolution of gene duplicates using the following molecular data from primates.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 | >OrangutanECP
ATGGTTCCAAAACTGTTCACTTCCCAAATTTGTCTGCTTCTTCTGTTGGGGCTTAGTGGTGTGGGGGGCTCACTCCATGCCAAACCCCGACAGTTTACGAGGGCTCAGTGGTTTGCCATCCAGCACGTCAGTCTGAACCCTCCTCAATGCACCACTGCAATGCGGGTAATTAACAATTATCAACGGCGTTGCAAAGACCAAAATACTTTTCTTCGTACAACTTTTGCTAATGTAGTTAATGTTTGTGGTAACCCAAATATAACCTGTCCTCGTAACAGAACTCTCCACAATTGTCATCGGAGTAGATTCCAGGTGCCTTTACTCCACTGTAACCTCACAGGTCAGAATATTTCAAACTGCAAGTATGCAGACAGAACAGAAAGGAGGTTCTATGTAGTTGCATGTGACAACAGAGATCCACGGGATTCTCCACGGTATCCTGTGGTTCCAGTTCACCTGGATACCACCATCTAA
>OrangutanEDN
ATGGTTCCAAAACTGTTCACTTCTCAAATTTCCCTGCTTCTTCTGTTGGGGCTTCTGGCTGTGGACGGCTCACTCCATGTCAAACCTCCACAGTTTACCTGGGCTCAATGGTTTGAAACCCAGCACATCAATATGACCTCCCAGCAATGCAACAATGCAATGCAGGTCATTAACAATTTTCAACGGCGTTGCAAAAACCAAAATACTTTTCTGCGTACAACTTTTGCTAATGTAGTTAATGTTTGTGGTAACCCAAATATAACCTGTCCTAGTAACAGAAGTCGCAACAATTGTCATCATAGTGGAGTCCAGGTGCCTTTAATCCACTGTAACCTCACAAGTCAGAATATTTCAAACTGCAGGTATGCGCAGACACCAGCAAACATGTTCTATATAGTTGCATGTGACAACAGGGATCCACGGGACCCTCCACAGTATCCGGTGGTTCCAGTTCACCTGGATAGAATCATCTAA
>MacaqueECP
ATGGTTCCAAAACTGTTCACTTCCCAAATTTGTCTGCTTCTTCTGTTGGGGCTTATGGGTGTGGAGGGCTCACTCCATGCCAGACCCCCACAGTTTACAAAGGCTCAGTGGTTTGCCATCCAGCACATCAATGTGAACCCCCCTCGATGCACCATTGCAATGCGGGTAATAAATAATTATCAACGGCGTTGCAAAAACCAAAATACTTTTCTTCGTACAACTTTTGCATATACAGCTAATGTTTGTCGTAACGAACGTATACGCTGCCCTCGTAACAGAACTCTCCACAATTGTCATCGTAGTAGATACCGGGTGCCTTTACTCCACTGTGACCTCACAGGTCAGAATATTTCAACCTGCAGGTATGCAGACAGACCAGGACGGAGGTTCTATGTAGTTGCATGTGAAAGCAGAGATCCACGGGATTCTCCACGGTATCCAGTGGTTCCAGTTCACCTGGATACCACCATCTAA
>ChimpanzeeEDN
ATGGTTCCAAAACTGTTCACTTCCCAAATTTGTCTGCTTCTTCTGTTGGGGCTTCTGGCTGTGGAGGGCTCACTCCATGTCAAACCTCCACAGTTTACCTGGGCTCAATGGTTTGAAACCCAGCACATCAATATGACATCCCAGCAATGCACCAATGCAATGCAGGTCATTAACAATTATCAACGGCGATGCAAAAACCAAAATACTTTCCTTCTTACAACTTTTGCTAACGTAGTTAATGTTTGTGGTAACCCAAATATGACCTGTCCTAGTAACAAAACTCGCAAAAATTGTCATCAAAGTGGAAGCCAGGTGCCTTTAATCCACTGTAACCTCACAAGTCAGAATATTTCAAACTGCAGGTATGCGCAGACACCAGCAAACATGTTCTATATAGTTGCATGTGACAACAGAGATCAACGGGACCCTCCACAGTATCCGGTGGTTCCAGTTCACCTGGATAGAATCATCTAA
>MacaqueEDN
ATGGTTCCAAAACTGTTCACTTCCCAAATTTGTCTGCTTCTTCTGTTGGGGCTTATGGGTGTGGAAGGCTCACTTCATGCCAAACCCGGACAATTTACCTGGGCTCAGTGGTTTGAAATCCAGCATATAAATATGACCTCTGGCCAATGCACCAATGCAATGCAGGTCATTAACAATTATCAACGGCGATGCAAAAATCAAAATACTTTTCTTCTTACAACTTTTGCTGATGTAGTTCATGTCTGTGGTAACCCAAGCATGCCCTGCCCTAGCAACACAAGTCTCAACAATTGTCATCATAGTGGAGTCCAGGTGCCTTTAATCCACTGTAACCTCACAAGTCGAAGGATTTCAAATTGCAGGTATACACAGACAACAGCAAACAAGTACTACATAGTTGCATGTAACAACAGCGATCCACGGGACCCTCCACAGTATCCAGTGGTTCCAGTTCACCTGGATAGAATCATCTAA
>GorillaEDN
ATGGTTCCAAAACTGTTCACTTCCCAAATTTGTCTGCTTCTTCTGTTGGGGCTTCTGGCAGTGGAGGGCTCACTCCATGTCAAACCTCCACAGTTTACCTGGGCTCAATGGTTTGAAACCCAGCACATCAATATGACCTCCCAGCAATGCACCAATGCAATGCGGGTCATTAACAATTATCAACGGCGATGCAAAAACCAAAATACTTTCCTTCTTACAACTTTTGCTAACGTAGTTAATGTTTGTGGTAACCCAAATATGACCTGTCCTAGTAACAAAACTCGCAAAAATTGTCATCACAGTGGAAGCCAGGTGCCTTTAATCCACTGTAACCTCACAAGTCAGAATATTTCAAACTGCAGGTATGCGCAGACACCAGCAAACATGTTCTATATAGTTGCATGTGACAACAGAGATCAACGGGACCCTCCACAGTATCCAGTGGTTCCAGTTCACCTGGATAGAATCATCTAA
>GorillaECP
ATGGTTCCAAAACTGTTCACTTCCCAAATTTGTCTGCTTCTTCTGTTGGGGCTTATGGGTGTGGAGGGCTCACTCCATGCCAGACCCCCACAGTTTACGAGGGCTCAGTGGTTTGCCATCCAGCACATCAGTCTGAACCCCCCTCGATGCACCATTGCAATGCGGGTAATTAACAATTATCGATGGCGTTGCAAAAACCAAAATACTTTTCTTCGTACAACTTTTGCTAATGTAGTTAATGTTTGTGGTAACCAAAGTATACGCTGCCTTCATAACAGAACTCTCAACAATTGTCATCGGAGTAGATTCCGGGTGCCTTTACTCCACTGTGACCTCACAGGTCAGAATATTTCAAACTGCAGGTATGCAGACAGACCAGGAAGGAGGTTCTATGTAGTTGCATGTGACAACAGAGATCCACAGGATTCTCCACGGTATCCTGTGGTTCCTGTTCACCTGGATACCACCATCTAA
>TamarinEDN
ATGGTTCCAAAACTGTTCACTTCCCAAATTTGCGTGCTTCTTCTTTTCGGGCTTTTGAGTGTGGAGGTCTCACTCCAGGTCAAACCCCAGCAGTTTTCCTGGGCTCAGTGGTTTAGCATCCAGCACATCCAAACAACTCCCCTCCACTGCACCTCTGCAATGCGGGCAATTAACAGGTATCAACCTCGATGCAAAAACCAAAATACTTTTCTTCATACAACTTTTGCTAATGTAGTTAATGTTTGTGGTAACACAAATATCACCTGCCCTCGTAATGCATCTCTCAACAATTGTCATCACAGTGGAGTCCAGGTGCCTTTAACCTACTGTAACCTCACAGGTCAGACTATTTCAAACTGTGTGTATTCCTCGACTCAGGCAAACATGTTCTATGTAGTTGCATGTGACAACAGAGATCCACGGGATCCTCCACAGTATCCAGTGGTCCCGGTTCACCTGGATACCACCATCTAA
>ChimpanzeeECP
ATGGTTCCAAAACTGTTCACTTCCCAAATTTGTCTGCTTCTTCTGTTGGGGCTTATGGGTGTGGAGGGCTCACTCCATGCCAGACCCCCACAGTTTACGAGGGCTCAGTGGTTTGCCATCCAGCACATCAGTCTGAACCCCCCTCGATGCACCATTGCAATGCGGGTAATTAACAATTATCGATGGCGTTGCAAAAACCAAAATACTTTTCTTCGTACAACTTTTGCTAATGTAGTTAATGTTTGTGGTAACCAAAGTATACGCTGCCCTCATAACAGAACCCTCAACAATTGTCATCAGAGTAGATTCCGGGTGCCTTTACTCCACTGTGACCTCACAGGTCAGAATATTTCAAACTGCGGGTATGCAGACAGACCAGGAAGGAGGTTCTATGTAGTTGCATGTGACAACAGAGATCCACGGGATTCTCCACGGTATCCTGTGGTTCCAGTTCACCTGGATACCACCATCTAA
|
finite differences gradient¶
The following maximum likelihood estimation uses L-BFGS-B with finite-differences approximations of the gradient of the log likelihood.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | from __future__ import print_function, division, absolute_import
import functools
import itertools
import copy
import json
import numpy as np
from numpy.testing import assert_equal
from scipy.misc import logsumexp
from scipy.optimize import minimize
import jsonctmctree.interface
def hky(distn, k):
R = np.array([
[0, 1, k, 1],
[1, 0, 1, k],
[k, 1, 0, 1],
[1, k, 1, 0],
]) * distn
return R, R.sum(axis=1).dot(distn)
def gen_transitions(distn, kappa, tau):
R, expected_rate = hky(distn, kappa)
R = R / expected_rate
for i in range(4):
for j in range(4):
if i == j:
for k in range(4):
if i != k:
yield (i, j), (k, j), R[i, k]
if j != k:
yield (i, j), (i, k), R[j, k]
else:
yield (i, j), (i, i), R[j, i] + tau
yield (i, j), (j, j), R[i, j] + tau
for k in range(4):
if i != k and j != k:
yield (i, j), (k, j), R[i, k]
yield (i, j), (i, k), R[j, k]
def pack(distn, kappa, tau, rates):
return np.log(np.concatenate([distn, [kappa, tau], rates]))
def unpack(X):
lse = logsumexp(X[0:4])
unpacking_cost = lse * lse
distn = np.exp(X[0:4] - lse)
kappa, tau = np.exp(X[4:6])
rates = np.exp(X[6:])
return distn, kappa, tau, rates, unpacking_cost
def objective(scene, X):
distn, kappa, tau, rates, unpacking_cost = unpack(X)
scene['root_prior']['probabilities'] = distn.tolist()
scene['tree']['edge_rate_scaling_factors'] = rates.tolist()
triples = list(gen_transitions(distn, kappa, tau))
rows, cols, transition_rates = zip(*triples)
process_definition = {
'row_states' : [list(x) for x in rows],
'column_states' : [list(x) for x in cols],
'transition_rates' : list(transition_rates)
}
scene['process_definitions'] = [process_definition]
request = {'property' : 'snnlogl'}
j_in = {'scene' : scene, 'requests' : [request]}
j_out = jsonctmctree.interface.process_json_in(j_in)
log_likelihood = j_out['responses'][0]
cost = -log_likelihood + unpacking_cost
return cost
def main():
name_to_node = {
'tamarin' : 0,
'macaque' : 1,
'orangutan' : 2,
'chimpanzee' : 3,
'gorilla' : 4}
paralog_to_variable = {
'ecp' : 0,
'edn' : 1}
nodes = []
variables = []
rows = []
with open('paralogs.fasta') as fin:
while True:
line = fin.readline().strip().lower()
if not line:
break
name = line[1:-3]
paralog = line[-3:]
seq = fin.readline().strip()
row = ['ACGT'.index(x) for x in seq]
nodes.append(name_to_node[name])
variables.append(paralog_to_variable[paralog])
rows.append(row)
columns = [list(x) for x in zip(*rows)]
distn = [0.25, 0.25, 0.25, 0.25]
rates = [1, 1, 1, 1, 1, 1, 1, 1]
scene = {
"node_count" : 9,
"process_count" : 1,
"state_space_shape" : [4, 4],
"tree" : {
"row_nodes" : [5, 5, 6, 6, 7, 7, 8, 8],
"column_nodes" : [0, 6, 1, 7, 2, 8, 3, 4],
"edge_rate_scaling_factors" : rates,
"edge_processes" : [0, 0, 0, 0, 0, 0, 0, 0]
},
"root_prior" : {
"states" : [[0, 0], [1, 1], [2, 2], [3, 3]],
"probabilities" : distn
},
"observed_data" : {
"nodes" : nodes,
"variables" : variables,
"iid_observations" : columns
}
}
X = pack(distn, 2.0, 3.0, rates)
f = functools.partial(objective, scene)
result = minimize(f, X, method='L-BFGS-B')
print('final value of objective function:', result.fun)
distn, kappa, tau, rates, unpacking_cost = unpack(result.x)
print('nucleotide distribution:')
for nt, p in zip('ACGT', distn):
print(' ', nt, ':', p)
print('kappa:', kappa)
print('tau:', tau)
print('edge rate scaling factors:')
for r in rates:
print(' ', r)
main()
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | final value of objective function: 1721.78345437
nucleotide distribution:
A : 0.282900197002
C : 0.255266259509
G : 0.207346511578
T : 0.254487031912
kappa: 2.11389841003
tau: 1.8204058747
edge rate scaling factors:
0.102974211706
0.0706132002156
0.0511441873578
0.00878905788968
0.0298626978464
0.0109179434163
0.00455556207782
0.00501138322281
|
explicit edge rate derivatives¶
This more complicated implementation still uses L-BFGS-B but with the edge-rate component of the gradient computed using calculus instead of finite differences. The entries of the gradient corresponding to the other parameters of the model (mutational nucleotide distribution, transition/transversion rate ratio, and gene conversion rate) are still estimated using a finite differences approximation.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 | from __future__ import print_function, division, absolute_import
import functools
import json
import numpy as np
from numpy.testing import assert_equal
from scipy.misc import logsumexp
from scipy.optimize import minimize
import jsonctmctree.interface
def hky(distn, k):
R = np.array([
[0, 1, k, 1],
[1, 0, 1, k],
[k, 1, 0, 1],
[1, k, 1, 0],
]) * distn
return R, R.sum(axis=1).dot(distn)
def gen_transitions(distn, kappa, tau):
R, expected_rate = hky(distn, kappa)
R = R / expected_rate
for i in range(4):
for j in range(4):
if i == j:
for k in range(4):
if i != k:
yield (i, j), (k, j), R[i, k]
if j != k:
yield (i, j), (i, k), R[j, k]
else:
yield (i, j), (i, i), R[j, i] + tau
yield (i, j), (j, j), R[i, j] + tau
for k in range(4):
if i != k and j != k:
yield (i, j), (k, j), R[i, k]
yield (i, j), (i, k), R[j, k]
def pack(distn, kappa, tau, rates):
return np.log(np.concatenate([distn, [kappa, tau], rates]))
def unpack(X):
lse = logsumexp(X[0:4])
unpacking_cost = lse * lse
distn = np.exp(X[0:4] - lse)
kappa, tau = np.exp(X[4:6])
rates = np.exp(X[6:])
return distn, kappa, tau, rates, unpacking_cost
def get_process_defn_and_prior(distn, kappa, tau):
triples = list(gen_transitions(distn, kappa, tau))
rows, cols, transition_rates = zip(*triples)
process_definition = {
'row_states' : [list(x) for x in rows],
'column_states' : [list(x) for x in cols],
'transition_rates' : list(transition_rates)
}
root_prior = {
"states" : [[0, 0], [1, 1], [2, 2], [3, 3]],
"probabilities" : list(distn)
}
return process_definition, root_prior
def objective_and_gradient(scene, X):
delta = 1e-8
distn, kappa, tau, rates, unpacking_cost = unpack(X)
scene['tree']['edge_rate_scaling_factors'] = rates.tolist()
log_likelihood_request = {'property' : 'snnlogl'}
derivatives_request = {'property' : 'sdnderi'}
# Get the log likelihood and per-edge derivatives.
# Note that the edge derivatives are of the log likelihood
# with respect to logs of edge rates, and we will eventually
# multiply them by -1 to get the gradient of the cost function
# which we want to minimize rather than the log likelihood function
# which we want to maximize.
process_defn, root_prior = get_process_defn_and_prior(distn, kappa, tau)
scene['root_prior'] = root_prior
scene['process_definitions'] = [process_defn]
j_in = {
'scene' : scene,
'requests' : [log_likelihood_request, derivatives_request]
}
j_out = jsonctmctree.interface.process_json_in(j_in)
log_likelihood, edge_gradient = j_out['responses']
cost = -log_likelihood + unpacking_cost
# For each non-edge-specific parameter get finite-differences
# approximation of the gradient.
nedges = len(scene['tree']['row_nodes'])
nparams = len(X) - nedges
gradient = []
for i in range(nparams):
W = np.copy(X)
W[i] += delta
distn, kappa, tau, rates, unpacking_cost = unpack(W)
process_defn, root_prior = get_process_defn_and_prior(distn, kappa, tau)
scene['root_prior'] = root_prior
scene['process_definitions'] = [process_defn]
j_in = {
'scene' : scene,
'requests' : [log_likelihood_request]
}
j_out = jsonctmctree.interface.process_json_in(j_in)
ll = j_out['responses'][0]
c = -ll + unpacking_cost
slope = (c - cost) / delta
gradient.append(slope)
gradient.extend([-x for x in edge_gradient])
gradient = np.array(gradient)
# Return cost and gradient.
return cost, gradient
def main():
name_to_node = {
'tamarin' : 0,
'macaque' : 1,
'orangutan' : 2,
'chimpanzee' : 3,
'gorilla' : 4}
paralog_to_variable = {
'ecp' : 0,
'edn' : 1}
nodes = []
variables = []
rows = []
with open('paralogs.fasta') as fin:
while True:
line = fin.readline().strip().lower()
if not line:
break
name = line[1:-3]
paralog = line[-3:]
seq = fin.readline().strip()
row = ['ACGT'.index(x) for x in seq]
nodes.append(name_to_node[name])
variables.append(paralog_to_variable[paralog])
rows.append(row)
columns = [list(x) for x in zip(*rows)]
distn = [0.25, 0.25, 0.25, 0.25]
rates = [1, 1, 1, 1, 1, 1, 1, 1]
kappa = 2.0
tau = 3.0
process_defn, root_prior = get_process_defn_and_prior(distn, kappa, tau)
scene = {
"node_count" : 9,
"process_count" : 1,
"state_space_shape" : [4, 4],
"tree" : {
"row_nodes" : [5, 5, 6, 6, 7, 7, 8, 8],
"column_nodes" : [0, 6, 1, 7, 2, 8, 3, 4],
"edge_rate_scaling_factors" : rates,
"edge_processes" : [0, 0, 0, 0, 0, 0, 0, 0]
},
"root_prior" : root_prior,
"process_definition" : process_defn,
"observed_data" : {
"nodes" : nodes,
"variables" : variables,
"iid_observations" : columns
}
}
X = pack(distn, kappa, tau, rates)
f = functools.partial(objective_and_gradient, scene)
result = minimize(f, X, jac=True, method='L-BFGS-B')
print('final value of objective function:', result.fun)
distn, kappa, tau, rates, unpacking_cost = unpack(result.x)
print('nucleotide distribution:')
for nt, p in zip('ACGT', distn):
print(' ', nt, ':', p)
print('kappa:', kappa)
print('tau:', tau)
print('edge rate scaling factors:')
for r in rates:
print(' ', r)
main()
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | final value of objective function: 1721.78345463
nucleotide distribution:
A : 0.282900066429
C : 0.255266243854
G : 0.207346445744
T : 0.254487243973
kappa: 2.11388819482
tau: 1.82041945074
edge rate scaling factors:
0.102973912525
0.0706137070586
0.0511441290183
0.00878881519788
0.0298626104962
0.0109179537121
0.00455550645537
0.00501134613073
|