adna-workflow/preseq_process.py at master · DReichLab/adna-workflow · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# Process the output of preseq to estimate the number of (additional) reads required for a library
#

from __future__ import print_function
import argparse

# preseq projects the number of unique reads given a number of reads, given a histogram of reads at a particular point
# We analyze four different quantities
# 1. raw_reads (reads demultiplexing for index-barcode key)
# 2. reads hitting a 1240k autosome target
# 3. unique reads hitting a 1240k autosome target
# 4. number of distinct 1240k targets hit

# 1 is assumed to be related to 2 by a constant ratio
# preseq computes the function between 2 and 3
# 4 is computed from 3 using an empirical model
def preseq_analysis(reads_hitting_any_target, unique_reads, number_raw_reads, total_reads_hitting_any_target_actual, unique_targets_hit, minimum_raw_reads, empiricalTargetEstimator):
	length = len(reads_hitting_any_target)
	if length != len(unique_reads):
		raise ValueError('length mismatch between entries in reads_hitting_any_target and unique_reads')

	inverse_e = 0.3678794
	tenth = 0.1

	raw_reads_inverse_e = float('inf')
	raw_reads_tenth = float('inf')

	# not all demultiplexing reads merge, align, or map to targets
	# use a simple ratio to translate between reads for preseq and demultiplexing reads
	read_ratio = 0
	if total_reads_hitting_any_target_actual > 0:
		read_ratio = float(number_raw_reads) / total_reads_hitting_any_target_actual
	#print(read_ratio)

	raw_reads = [x * read_ratio for x in reads_hitting_any_target]
	estimated_targets = [empiricalTargetEstimator.unique_reads_to_empirical_targets(x) for x in unique_reads]

	try:
		raw_reads_inverse_e, ignored = find_xy_for_slope(raw_reads, estimated_targets, inverse_e)
		raw_reads_tenth, ignored = find_xy_for_slope(raw_reads, estimated_targets, tenth)
	except:
		pass

	# ratio of unique targets to raw reads
	unique_target_thresholds = ['0.01', '0.0075', '0.005']

	values = {
		'number_raw_reads': number_raw_reads,
		'preseq_unique_targets_hit': unique_targets_hit,
		'preseq_raw_reads_inverse_e': raw_reads_inverse_e,
		'preseq_raw_reads_tenth': raw_reads_tenth,
	}
	# fill out values so something is always returned
	for threshold in unique_target_thresholds:
		raw_reads_threshold = 0
		expected_unique_targets_at_threshold = -1
		try:
			raw_reads_threshold, expected_unique_targets_at_threshold = find_xy_for_slope(raw_reads, estimated_targets, float(threshold))
		except:
			pass
		total_reads_required = max(raw_reads_threshold, minimum_raw_reads)
		additional_reads_required = max(total_reads_required - number_raw_reads, 0)

		# if we are already at 70% of raw reads goal, then stop
		if (additional_reads_required / total_reads_required) < 0.3:
			additional_reads_required = 0

		unique_reads_hitting_any_target_at_threshold = 0
		try:
			unique_reads_hitting_any_target_at_threshold = interpolate(raw_reads, unique_reads, raw_reads_threshold)
		except:
			pass

		values['preseq_total_reads_required_' + threshold] = total_reads_required
		values['preseq_additional_reads_required_' + threshold] = additional_reads_required
		values['preseq_expected_unique_targets_at_threshold_' + threshold] = expected_unique_targets_at_threshold
		values['preseq_target_coverage_at_threshold_' + threshold] = unique_reads_hitting_any_target_at_threshold / empiricalTargetEstimator.total_autosome_targets

		if len(raw_reads) > 0 and raw_reads_threshold == raw_reads[-1]: # outside of preseq projections
			values['preseq_total_reads_required_' + threshold] = '>{:.0f}'.format(values['preseq_total_reads_required_' + threshold])
			values['preseq_additional_reads_required_' + threshold] = '>{:.0f}'.format(values['preseq_additional_reads_required_' + threshold])

	# marginal uniqueness at current number of reads
	try:
		expected_distinct_reads, marginal_uniqueness = interpolate_base(reads_hitting_any_target, unique_reads, total_reads_hitting_any_target_actual)
		# the marginal uniqueness slope is not available for the last inteval
		# take the nearest valid interval as the estimate
		if marginal_uniqueness is None:
			ignored, marginal_uniqueness = interpolate_base(reads_hitting_any_target, unique_reads, reads_hitting_any_target[len(reads_hitting_any_target)-2])

		values['preseq_marginal_uniqueness'] = marginal_uniqueness
	except:
		pass
	# marginal uniqueness threshold
	marginal_uniqueness_thresholds = ['0.368', '0.10']
	for threshold in marginal_uniqueness_thresholds:
		reads_hitting_any_target_at_threshold = 0
		unique_reads_hitting_any_target_at_threshold = 0
		try:
			reads_hitting_any_target_at_threshold, unique_reads_hitting_any_target_at_threshold = find_xy_for_slope(reads_hitting_any_target, unique_reads, float(threshold))
		except:
			pass

		coverage = unique_reads_hitting_any_target_at_threshold / empiricalTargetEstimator.total_autosome_targets
		values['preseq_coverage_at_marginal_uniqueness_' + threshold] = coverage if (len(reads_hitting_any_target) > 0) and (reads_hitting_any_target_at_threshold < reads_hitting_any_target[-1]) else ('>{:.3f}'.format(coverage))

	return values

# assuming decreasing slope, find x, y such that the slope is the slope_threshold
# if slope is already less than threshold, report first value
def find_xy_for_slope(X, Y, slope_threshold):
	if len(X) != len(Y):
		raise ValueError('length mismatch')
	if len(X) < 2:
		raise ValueError('no slope can be computed')
	length = len(Y)
	slopes = [float('inf')]
	for i in range(1,length):
		slope = (Y[i] - Y[i-1]) / (X[i] - X[i-1])
		slopes.append(slope)

	i = len(slopes)-1
	while (i > 0) and (slope_threshold > slopes[i]):
		i -= 1
	# edge cases
	if i == 0:
		return X[0], Y[0]
	elif i == len(slopes)-1:
		return X[i], Y[i]

	slope_change = slopes[i+1] - slopes[i]
	x_change = X[i+1] - X[i]
	x_at_threshold = X[i] + (slope_threshold - slopes[i]) / slope_change * x_change
	# this is not exact: we are mixing a piecewise linear model with acceleration
	# just use the threshold value
	y_at_threshold = Y[i] + slope_threshold * (x_at_threshold - X[i])

	return x_at_threshold, y_at_threshold

def interpolate(X, Y, x):
	y, slope = interpolate_base(X, Y, x)
	return y

# return interpolated value of y at x and slope
# slope is defined on interval [ x_i, x_{i+1} )
def interpolate_base(X, Y, x):
	if len(X) != len(Y):
		raise ValueError('length mismatch')
	if len(X) == 0:
		raise ValueError('no data to interpolate')
	# find position where x[i] <= x < x[i+1]
	i = len(X)-1
	while (i > 0) and (x < X[i]):
		i -= 1

	if i == len(X)-1:
		if x > X[i]:
			return '>{:.1f}'.format(Y[i]), None
		else:
			return Y[i], None

	slope = (Y[i+1] - Y[i]) / (X[i+1] - X[i])
	x_change = x - X[i]
	y = Y[i] + slope * x_change
	return y, slope

# count number of reads overlapping any target and number of unique targets hit
# This histogram is not the input to preseq. It counts the number of reads
# for each target, which allows counting the actual number of targets hit
def total_and_unique_target_hits(target_histogram_filename):
	total_hits = 0
	unique_targets = 0
	try:
		with open(target_histogram_filename) as histogram:
			for line in histogram:
				target_count, occurrences = map(lambda x: int(x), line.split())
				total_hits += target_count * occurrences
				unique_targets += occurrences
	except:
		pass
	return total_hits, unique_targets

# defines the empirical relationship between the unique reads output from preseq and the number of unique 1240k targets hit
# empirical targets = 1 / (a + b * (unique reads) ^ power)
class EmpiricalTargetEstimator:
	total_autosome_targets = 1150639 # autosome targets in the 1240k target set
	a = 1
	b = 1
	power = -1;

	def __init__(self, a, b, power):
		self.a = a
		self.b = b
		self.power = power

	# historical empirical relationship between unique reads hitting targets and total targets covered
	def unique_reads_to_empirical_targets(self, unique_reads):
		estimated_hit_fraction = max(float(unique_reads) / self.total_autosome_targets, 1e-9)
		corrected_hit_faction = 1 / (self.a + self.b * (estimated_hit_fraction ** self.power))
		corrected_hit_estimate = corrected_hit_faction * self.total_autosome_targets
		# the number of unique targets is constrained by the number of unique reads
		corrected_hit_estimate = min(unique_reads, corrected_hit_estimate)
		return corrected_hit_estimate

def read_preseq_file(filename):
	reads_hitting_any_target = []
	unique_reads = []
	try:
		with open(filename) as preseq_file:
			# skip preseq header
			preseq_file.readline()

			# read in table of reads
			for line in preseq_file:
				reads, expected_distinct_reads, lower, upper = line.split()
				reads_hitting_any_target.append(float(reads))
				unique_reads.append(float(expected_distinct_reads))
	except:
		pass
	return reads_hitting_any_target, unique_reads

if __name__ == '__main__':
	parser = argparse.ArgumentParser()
	# number of raw demultiplexing reads that produced the histogram processed by preseq
	parser.add_argument("-n", "--number_raw_reads", help="number of raw reads demultiplexing", type=int, required=True)
	# minimum raw reads required for a library
	parser.add_argument("-m", "--minimum_raw_reads", help="minimum required number of raw reads demultiplexing", type=int, default=3e6)
	parser.add_argument("-r", "--total_reads_hitting_any_target_actual", help="Reads hitting any autosome target, used for computing read ratio", type=int, required=True)
	#parser.add_argument("-t", "--expected_targets_per_raw_read_threshold", help="threshold for ratio of expected unique targets hit to raw demultiplexing reads", type=float, default=0.01)

	# empirical model parameters
	parser.add_argument("-a", "--modelParameterA", help="parameter for empirical model of unique reads to total targets covered", type=float, default=1)
	parser.add_argument("-b", "--modelParameterB", help="parameter for empirical model of unique reads to total targets covered", type=float, default=1)
	parser.add_argument("-p", "--modelParameterPower", help="exponent parameter for empirical model of unique reads to total targets covered", type=float, default=-1)

	parser.add_argument("target_histogram_filename", help="Target histogram to count number of actual target hits. This is not the input to preseq.")
	parser.add_argument("preseq_filename", help="preseq file containing mapping of reads hitting targets to unique reads hitting targets. Name this using the index-barcode key.")
	parser.add_argument("-k", "--key", help="Key for aggregating statistics. Normally this should be index-barcode key.", required=True)
	args = parser.parse_args()

	empiricalTargetEstimator = EmpiricalTargetEstimator(args.modelParameterA, args.modelParameterB, args.modelParameterPower)

	# empirically measured parameters to extrapolate from
	target_hits_ignored, unique_targets_hit = total_and_unique_target_hits(args.target_histogram_filename)

	# read preseq table from file
	reads_hitting_any_target, unique_reads = read_preseq_file(args.preseq_filename)

	values = preseq_analysis(reads_hitting_any_target, unique_reads, args.number_raw_reads, args.total_reads_hitting_any_target_actual, unique_targets_hit, args.minimum_raw_reads, empiricalTargetEstimator)

	# output
	print(args.key, end='\t')
	for key in values:
		print(key, end='\t')
		if str(values[key]).startswith('>'):
			print('{}'.format(values[key]), end='\t')
		elif 'coverage' in key or 'preseq_marginal_uniqueness' in key:
			print('{:.5f}'.format(values[key]), end='\t')
		else:
			print('{:.0f}'.format(values[key]), end='\t')