【python练习题二】正则表达式

Riven · 发表于 2019-12-6 16:21:49

本帖最后由 Riven 于 2019-12-6 16:26 编辑

test1.

#!/usr/bin/env python
# -*- coding=utf-8 -*-
import os
import gzip
import re
import argparse
# Command-line interface: a mandatory input path and an optional output path
# (defaults to "gene.xls" in the current directory).
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--infile', required=True, help="the input file path")
parser.add_argument('-o', '--outfile', help="the output file path", default="gene.xls")
args = vars(parser.parse_args())
# Work with absolute paths from here on.
inf = os.path.abspath(args['infile'])
outf = os.path.abspath(args['outfile'])
def safe_open(file, mode):
    """Open ``file`` with ``mode``, transparently handling gzip archives.

    Args:
        file: Path of the file to open; a ``.gz`` suffix selects ``gzip.open``.
        mode: Mode string as accepted by the built-in ``open``.

    Returns:
        An open file object (usable as a context manager).
    """
    if file.endswith(".gz"):
        # gzip.open treats a mode without 'b'/'t' as binary, whereas the
        # built-in open treats it as text.  Make both branches agree so the
        # caller gets the same line type for the same mode.
        if "b" not in mode and "t" not in mode:
            mode += "t"
        return gzip.open(file, mode)
    return open(file, mode)
# Compiled once at import time.  The value runs up to the next ';' or
# whitespace, so symbols containing '-' or '.' and a GENE entry that closes
# the field (no trailing ';') are both accepted.
_GENE_PATTERN = re.compile(r'GENE=([^;\s]+)')


def extract_gene(str):
    """Return the value of the first ``GENE=<name>`` entry found in ``str``.

    Args:
        str: Text to search (the parameter name shadows the builtin; it is
            kept unchanged so existing callers are unaffected).

    Returns:
        The gene name, without the ``GENE=`` prefix or any trailing ``;``.

    Raises:
        ValueError: If ``str`` contains no ``GENE=<name>`` entry.
    """
    gene_name = _GENE_PATTERN.search(str)
    if gene_name is None:
        raise ValueError("no GENE=<name> entry found in: %r" % (str,))
    return gene_name.group(1)
# Read the input in text mode: the loop below uses str operations
# (startswith('#'), a str regex, '+ "\n"'), which raise TypeError on the
# bytes lines that binary mode ('rb') yields under Python 3.
with safe_open(inf, 'rt') as infile, open(outf, 'w') as outfile:
    for line in infile:
        # Lines starting with '#' are skipped; no gene is extracted from them.
        if line.startswith('#'):
            continue
        # One extracted gene name per output line.
        outfile.write(extract_gene(line) + '\n')

复制代码

Riven · 发表于 2019-12-6 16:22:26

本帖最后由 Riven 于 2019-12-6 16:25 编辑

test2.
#!/usr/bin/env python
# -*- coding=utf-8 -*-
import os
import gzip
import re
import argparse
from Bio import SeqIO
# Command-line options: reference FASTA, chromosome name, the base pattern to
# look for, and the report path.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-r', '--ref',
    default="/PUBLIC/database/HUMAN/genome/Human/human_g1k_v37_decoy.fasta",
    help="the reference file path, default:human b37, *.fasta",
)
parser.add_argument(
    '-c', '--chr',
    required=True,
    help="specify the chromosome to search",
)
parser.add_argument(
    '-b', '--bases',
    required=True,
    help="the group of bases to search",
)
parser.add_argument(
    '-o', '--outfile',
    default="search_out.xls",
    help="the output file path",
)
args = vars(parser.parse_args())
print(args)

# Path-like options are whitespace-trimmed and made absolute; the other two
# are only trimmed.
# NOTE: `chr` shadows the builtin of the same name; the name is kept because
# the entry-point call further down reads it.
ref, out = (os.path.abspath(args[key].strip()) for key in ('ref', 'outfile'))
chr, base_group = (args[key].strip() for key in ('chr', 'bases'))
def get_base_group_info(ref, chrom, residue, file):
    """Write the position of every match of ``residue`` on one chromosome.

    Each match is written to ``file`` as ``<chrom>:<start>-<end>`` with
    1-based, inclusive coordinates, one match per line.  Matches are found
    with ``re.finditer`` and are therefore non-overlapping.

    Args:
        ref: Path of the reference FASTA file.
        chrom: Record id of the chromosome to search.
        residue: Pattern to look for; interpreted as a regular expression.
        file: Path of the output file (overwritten).

    Raises:
        KeyError: If ``ref`` contains no record whose id equals ``chrom``.
    """
    # Stream the FASTA and stop at the requested record instead of loading
    # every record into a dict just to read one of them.
    chr_seq = None
    for record in SeqIO.parse(ref, "fasta"):
        if record.id == chrom:
            chr_seq = str(record.seq)
            break
    if chr_seq is None:
        # Checked before the output is opened so that a wrong chromosome
        # name does not leave an empty, truncated report behind.
        raise KeyError(chrom)

    with open(file, 'w') as out:
        for hit in re.finditer(residue, chr_seq):
            # Match spans are 0-based and half-open: start + 1 is the 1-based
            # first base, and end is already the 1-based last base.
            out.write('{}:{}-{}\n'.format(chrom, hit.start() + 1, hit.end()))
# Run the search only when executed as a script, using the values parsed
# from the command line.
if __name__ == "__main__":
    get_base_group_info(ref=ref, chrom=chr, residue=base_group, file=out)

复制代码

Riven · 发表于 2019-12-6 16:22:55

本帖最后由 Riven 于 2019-12-6 16:25 编辑

test3.
#!/usr/bin/env python
# -*- coding=utf-8 -*-
import re
import os
import argparse
# Command-line options: a mandatory input table and an optional output path.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-i', '--infile',
    required=True,
    help="infile path",
)
parser.add_argument(
    '-o', '--outfile',
    default="out_share.xls",
    help="outfile path",
)
args = vars(parser.parse_args())

# Both paths are whitespace-trimmed and converted to absolute paths.
infile, outfile = (
    os.path.abspath(args[key].strip()) for key in ('infile', 'outfile')
)
def is_variation(string):
    """Return True when a genotype string carries a non-reference allele.

    The genotype is split on both allele separators (``/`` and ``|``).  It
    counts as a variation as soon as one allele is something other than
    ``0`` (reference) or ``.`` (missing).  So ``0/0``, ``./.``, ``0|0``,
    ``.|.``, ``0/.`` and an empty string are all "no variation", while
    ``0/1``, ``1|0`` or ``1/2`` are variations.

    Args:
        string: Genotype text, e.g. ``"0/1"``.

    Returns:
        bool: True if at least one allele is neither ``0`` nor ``.``.
    """
    alleles = string.replace('|', '/').split('/')
    return any(allele not in ('0', '.', '') for allele in alleles)
# Append a "Variation_sample" column to the table.  For every data row it
# lists the samples whose genotype is a variation, or "." when there are none.
with open(infile, 'r') as inf, open(outfile, 'w') as out:
    idx_lst = []      # column indices of the sample columns
    header_lst = []   # column names taken from the header row
    for line in inf:
        # Remove only the line terminator.  Stripping all whitespace would
        # also drop a leading empty column and shift every index.
        row = line.rstrip('\n')
        if line.startswith('Priority'):
            header_lst = row.split('\t')
            out.write(row + '\t' + 'Variation_sample' + '\n')
            # Sample columns are the ones between FORMAT and Ori_REF.
            # Taking their positions directly stays correct when two samples
            # share a name.
            first_sample = header_lst.index('FORMAT') + 1
            idx_lst = list(range(first_sample, header_lst.index('Ori_REF')))
            continue
        if not row.strip():
            # A blank line (typically at end of file) has no columns to index.
            continue
        content_lst = row.split('\t')
        # The genotype is the first ':'-separated field of a sample column.
        lst = [
            header_lst[j]
            for j in idx_lst
            if is_variation(content_lst[j].split(':')[0])
        ]
        out.write(row + '\t' + (','.join(lst) if lst else '.') + '\n')

复制代码

helili · 发表于 2019-12-6 16:26:48

本帖最后由 helili 于 2019-12-6 16:28 编辑

#!/usr/bin/env python
#-*- coding=utf-8 -*-

import os
import re
import sys

# Positional arguments: <infile> <outfile>.  Stop with a usage message
# instead of a bare IndexError when either one is missing.
if len(sys.argv) < 3:
    sys.exit("usage: {} <infile> <outfile>".format(sys.argv[0]))
infile = sys.argv[1]
outfile = sys.argv[2]

def safe_open(file, mode):
    """Open ``file`` with ``mode``, transparently handling gzip archives.

    Args:
        file: Path of the file to open; it is made absolute first.  A
            ``.gz`` suffix selects ``gzip.open``.
        mode: Mode string as accepted by the built-in ``open``.

    Returns:
        An open file object (usable as a context manager).

    Raises:
        SystemExit: If ``file`` does not exist.
    """
    file = os.path.abspath(file)
    if not os.path.exists(file):
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to be defined in every interpreter setup.
        sys.exit("%s is not exists" % file)
    if file.endswith('.gz'):
        import gzip
        # gzip.open treats a mode without 'b'/'t' as binary, whereas the
        # built-in open treats it as text.  Make both branches agree so the
        # caller gets str lines for "r" either way.
        if 'b' not in mode and 't' not in mode:
            mode += 't'
        return gzip.open(file, mode)
    return open(file, mode)

# The gene name runs up to the next ';' (or the end of the field), so a GENE
# entry that closes the column is matched too.
pattern = re.compile(r'GENE=([^;]+)')

# Copy '#' lines through unchanged.  For every other line write its first
# seven columns, followed on the next line by the gene name taken from the
# eighth column.
# Mode "rt" keeps lines as str for gzip input as well as for plain files.
with safe_open(infile, "rt") as f, open(outfile, "w") as out:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines have no eighth column to read.
            continue
        if line.startswith('#'):
            out.write(line + '\n')
            continue
        fields = line.split('\t')
        site = fields[0:7]
        info = fields[7]
        found = pattern.search(info)
        # '.' marks a record without a GENE entry instead of aborting the run.
        gene = found.group(1) if found else '.'
        out.write('\t'.join(site) + '\n')
        out.write(gene + "\n")

chenming · 发表于 2019-12-6 17:10:36

https://gitee.com/biocoder/pytho ... aster/re/re_test.py re模块一些需要注意的点

【python练习题二】正则表达式

练习题1

发表回复