root / Am1.0 / script / gene / make_orf.py @ 52e54a06
History | View | Annotate | Download (1.8 kB)
1 | 8c368a17 | Daofeng Li | |
---|---|---|---|
# Index every 'transcript' row of the GTF.
# Key:   the value of the second attribute in column 9 (quotes removed).
# Value: [chrom, start, end, strand, [], []] -- the two trailing lists start
#        empty and receive CDS start/end coordinates further down the script.
gene = {}
with open('transcripts.gtf') as fin:
    for line in fin:
        lst = line.rstrip().split('\t')
        # Comment, blank or truncated lines do not have the nine tab-separated
        # columns indexed below; skip them instead of raising IndexError.
        if len(lst) < 9:
            continue
        if lst[2] == 'transcript':
            # NOTE(review): assumes column 9 is laid out as
            # 'gene_id "..."; transcript_id "..."; ...' so that the second
            # attribute is the transcript id -- confirm against the GTF producer.
            transcript_id = lst[8].split('; ')[1].split()[1].replace('"', '')
            gene[transcript_id] = [lst[0], int(lst[3]), int(lst[4]), lst[6], [], []]
8 | 8c368a17 | Daofeng Li | |
9 | 8c368a17 | Daofeng Li | |
# Attach every CDS row of the ORF predictions to its transcript record.
# Column 1 of the GFF3 is used as the key into `gene`; a key that is missing
# there raises KeyError.
with open('best_candidates.gff3') as gff:
    for row in gff:
        cols = row.rstrip().split('\t')
        # Only complete (nine-column) CDS rows are of interest.
        if len(cols) >= 9 and cols[2] == 'CDS':
            rec = gene[cols[0]]
            offset = rec[1]  # start coordinate stored for the transcript
            # Shift the CDS coordinates by the transcript start.
            # NOTE(review): the -2 / -1 presumably turn 1-based,
            # transcript-relative positions into a 0-based start and an
            # exclusive end on the chromosome -- confirm.
            rec[4].append(offset + int(cols[3]) - 2)
            rec[5].append(offset + int(cols[4]) - 1)
19 | 8c368a17 | Daofeng Li | |
20 | 8c368a17 | Daofeng Li | |
# Write one row per transcript that has at least one CDS interval:
#   orf        -- chrom, transcript start, transcript end, name, id, strand
#   orf.struct -- id, chrom, strand, start, stop, start, stop, interval count,
#                 comma-joined starts, comma-joined ends, name
#                 (the column order of the `orfstruct` table created below)
# Transcripts without any CDS interval are skipped and consume no id.
orf_id = 1
with open('orf', 'w') as fout:
    with open('orf.struct', 'w') as fout2:
        for name in gene:
            chrom, tx_start, tx_end, strand, cds_starts, cds_ends = gene[name]
            if not cds_starts:
                continue
            fout.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n'.format(
                chrom,
                tx_start,
                tx_end,
                name,
                orf_id,
                strand))
            # Outer bounds of all CDS intervals: smallest start, largest end.
            # Taking max(starts)/min(ends) instead yields start > stop as soon
            # as a transcript carries two non-overlapping CDS rows.
            start = min(cds_starts)
            stop = max(cds_ends)
            fout2.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\n'.format(
                orf_id,
                chrom,
                strand,
                start,
                stop,
                start,
                stop,
                len(cds_starts),
                ','.join([str(i) for i in cds_starts]),
                ','.join([str(i) for i in cds_ends]),
                name))
            orf_id += 1
49 | 8c368a17 | Daofeng Li | |
50 | 8c368a17 | Daofeng Li | |
import os
import subprocess

# Post-process the 'orf' file with external tools:
#   1. sort by column 1, then numerically by column 2 (via temp file 'xx')
#   2. move the sorted temp file back over 'orf'
#   3. compress it with bgzip (produces 'orf.gz')
#   4. index 'orf.gz' with tabix using the BED preset
# check_call raises CalledProcessError on a non-zero exit status, so a failed
# step stops the script instead of feeding a missing or unsorted file to the
# next tool. Argument lists are used, so no shell is involved.
with open('xx', 'w') as sorted_out:
    subprocess.check_call(['sort', '-k1,1', '-k2,2n', 'orf'], stdout=sorted_out)
subprocess.check_call(['mv', 'xx', 'orf'])
subprocess.check_call(['bgzip', 'orf'])
subprocess.check_call(['tabix', '-p', 'bed', 'orf.gz'])
56 | 8c368a17 | Daofeng Li | |
57 | 8c368a17 | Daofeng Li | |
# SQL printed to stdout at the end of the run: it (re)creates the `orfstruct`
# table and loads 'orf.struct' into it, then (re)creates an empty `orfsymbol`
# table. Nothing in this script executes the SQL itself.
ORF_SCHEMA_SQL = '''
drop table if exists orfstruct;
create table orfstruct (
id int unsigned not null primary key,
chrom varchar(255) not null,
strand char(1) not null,
txStart int unsigned not null,
txEnd int unsigned not null,
cdsStart int unsigned not null,
cdsEnd int unsigned not null,
exonCount int unsigned not null,
exonStarts text not null,
exonEnds text not null,
name varchar(255) not null
);
load data local infile 'orf.struct' into table orfstruct;


drop table if exists orfsymbol;
create table orfsymbol (
name varchar(255) not null,
symbol varchar(255) null,
description text null,
id int unsigned not null primary key,
index(name)
);
'''

# A parenthesised single argument prints the same text under Python 2 and 3;
# the bare `print '''...'''` statement is a SyntaxError on Python 3.
print(ORF_SCHEMA_SQL)