将比对好的fasta序列转换成relaxed phylip格式

2012/02/231 6,631

有时,在推断进化树的过程中,我们希望phylip格式的文件能够将序列的名称完整得保留下来。通常情况下,我们的名称会超过10字符, 而ClustalX和MUSCLE等软件给出的phylip文件是严格的phylip格式,物种名称不能超过10个字符。 Clustal和MUSCLE还给出了FASTA格式,这种格式则完整得保留了各序列的名称.

为了从比对好的FASTA文件直接生成 relaxed phylip格式的文件, 这里提供了R的源代码. 以供有兴趣的同仁使用,并提出宝贵意见.

[code lang="R"]
### Convert Aligned Fasta to relax phylip

getsequence.fasta <- function (x = NULL)
{
if (!inherits(x, "fasta")) {
stop("Make sure the data is a fasta object.")
}
if (is.null(x)) {
stop("You have to specify the input data.")
}
result = x[!grepl(">", x)]
return(result)
}

getnames.fasta <- function (x = NULL)
{
if (!inherits(x, "fasta")) {
stop("Make sure the data is a fasta object.")
}
if (is.null(x)) {
stop("You have to specify the input data.")
}
result = x[grepl(">", x)]
result = gsub(">", "", result)
return(result)
}

dat2relax.phylip <- function (input, write = TRUE)
{
row1.1 <- nrow(input)
row1.2 <- nchar(as.character(input[1, 2]))
row1 <- paste(row1.1, row1.2)
space <- as.vector(max(nchar(as.character(input[,1]))) +
1 - nchar(as.character(input[, 1])))
res <- c()
for(i in 1:length(space)){
res[i] <- paste(input[i, 1],
paste(rep(" ", space[i]), collapse = ""),
input[i, 2], collapse = "")
}
res <- c(row1, res)
return(res)
}

read.fasta <- function (file = NULL)
{
if (is.null(file)) {
stop("Please specify the input fasta file.")
}
result <- readLines(file)
nameline <- result[grepl("[>]", result)]
test <- regexpr(">", nameline) > 1
if (any(test)) {
warning(paste("\">\" in line(s)", which(test),
"\n appeared not at the beginning.
\n Please remove any character(s) before \">\"."))
}
result = result[grepl("[A-Za-z0-9]", result)]
result <- ConvFas(result, "fas")
if (any(regexpr(">", result[seq(1, length(result), by = 2)]) <
0)) {
xx <- 2 * which(regexpr(">", result[seq(1, length(result),
by = 2)]) < 0)
if (length(xx) > 10) {
xx <- xx[1:10]
stop(paste("readfasta could not find \">\" in row: \n",
paste(xx, collapse = ", "), "... \n", "Make sure the file ",
file, " is in fasta format.\n"))
}
}
class(result) <- "fasta"
return(result)
}

ConvFas <- function (fil = NULL, type = c("fas", "nxs", "phy"))
{
match.arg(type)
dna = fil
if (type == "fas") {
seqNamPos = grep("^>", dna)
pos = c(seqNamPos, length(dna) + 1)
seqNam = dna[seqNamPos]
}
if (type == "nxs") {
dna = dna[(grep("matrix", dna, ignore.case = TRUE) +
1):length(dna)]
dna = dna[-which(dna == "" | dna == "end;" | dna == ";")]
seqNam = unique(substr(dna, 1, regexpr(" ", dna) - 1))
}
if (type == "phy") {
dna = dna[regexpr("[ATGC-]", dna) > 0]
seqNam = substr(dna, 1, regexpr(" ", dna) - 1)
seqNam = seqNam[-which(seqNam == "")]
}
nSeq = length(seqNam)
for (i in 1:nSeq) {
if (type == "fas") {
st = pos[i] + 1
ed = pos[i + 1] - 1
stri = gsub(" ", "", paste(dna[st:ed], collapse = ""))
}
if (type == "nxs" | type == "phy") {
nBlock = length(dna)/length(seqNam)
rNam = ((1:nBlock) - 1) * nSeq + i
stri = gsub("[ -]", "", gsub(seqNam[i], "", paste(dna[rNam],
collapse = "")))
stri = toupper(stri)
}
Nam = paste(">", seqNam[i], sep = "")
if (i == 1) {
DNA = c(Nam, stri)
}
if (i > 1) {
DNA = c(DNA, Nam, stri)
}
}
DNA = gsub(">>", ">", DNA)
class(DNA) <- "fasta"
return(DNA)
}

dat <- read.fasta("input.fasta")
nam <- getnames.fasta(dat)
seq <- getsequence.fasta(dat)
dat2 <- data.frame(nam, seq)
res <- dat2relax.phylip(dat2)
writeLines(res, "output.phy")
[/code]

来自：http://blog.sciencenet.cn/blog-255662-529962.html

发表评论