#!/usr/bin/env perl

use warnings;
use strict;

use Getopt::Long;

################################################################################
# Arguments - Start
my ($help, $name, $data_file, $info_file, $refseq_file, $source);

GetOptions(
	'help' => \$help,
	'name=s' => \$name,
	'data_file=s' => \$data_file,
	'info_file=s' => \$info_file,
	'refseq_file=s' => \$refseq_file,
	'source:s' => \$source
);

sub usage {
	print qq~
	Ensembl and UCSC assign EntrezGene IDs to transcripts
	based on sequence similarity, irrespective of whether
	the gene and the transcript are at (roughly) the same
	location. This results in assignments which are wrong.
	In order to sort out which EntrezGene IDs go with which
	Ensembl/UCSC IDs, we cross-reference with RefSeq.
	The same thing can crop up with EPConDB...

	Last updated: 21-Apr-08
	Author: James Allen (james.allen\@cimr.cam.ac.uk)

	Please provide
	--name
	--data_file
	--info_file
	--refseq_file
	[--source]

	[--help]
	~;
	print "\n";
	exit;
}

usage if defined $help;
usage unless $name && $data_file && $info_file && $refseq_file;
$source = 0 unless $source;

my $new_info_file = "$info_file.tmp";
# Arguments - End
################################################################################

################################################################################
# Process Info File - Start
my %gene_ids;

open(INFO_FILE, $info_file) || die "Cannot open file $info_file.";
open(NEW_INFO_FILE, ">$new_info_file") || die "Cannot open file $new_info_file.";

my ($line, $native_id_first);

if ($source eq 'epcondb' || $source eq 'ucsc') {
	# Native ID is first, then EntrezGene ID; there's no header row.
	$line = "";
	$native_id_first = 1;
} else {
	# The first line has column names, so we throw that away, once
	# we've worked out the order of the columns.
	$line = <INFO_FILE>;
	$native_id_first = 1;
	if ($line =~ /^EntrezGene ID/) {
		$native_id_first = 0;
	}
}

print NEW_INFO_FILE $line;

while (my $line = <INFO_FILE>) {
	$line =~ /^([^\t]*)\t([^\t^\n]*)$/;
	if ($1 and $2) {
		my ($native_id, $entrezgene_id);
		# We use a hash of hashes so that we get unique values.
		if ($native_id_first) {
			$native_id = $1;
			$entrezgene_id = $2;
		} else {
			$native_id = $2;
			$entrezgene_id = $1;
		}
		if ($source eq 'ensembl' || $source eq 'ucsc') {
			$native_id =~ s/\.\d+//;
		}
		$gene_ids{$native_id}{$entrezgene_id} = 1;
	}
}

close(INFO_FILE);
# Process Info File - End
################################################################################

################################################################################
# Process RefSeq File - Start
my %refseq_data;

open(REFSEQ_FILE, $refseq_file) || die "Cannot open file $refseq_file.";
while (my $line = <REFSEQ_FILE>) {
	my ($chr, $source, $method, $start, $stop,
		$score, $strand, $frame, $atts) = split(/\t/, $line);

	# We'll only ever have one gene id per row...
	my ($entrezgene_id) = $atts =~ /EntrezGene (\d+)/;

	if (exists $refseq_data{$entrezgene_id}) {
		if ($refseq_data{$entrezgene_id}{"chr"} eq $chr &&
			$refseq_data{$entrezgene_id}{"strand"} eq $strand)
		{
			$refseq_data{$entrezgene_id}{"min_start"} = $start if $start < $refseq_data{$entrezgene_id}{"min_start"};
			$refseq_data{$entrezgene_id}{"max_stop"} = $stop if $stop > $refseq_data{$entrezgene_id}{"max_stop"};
		} else {
			unless ($refseq_data{$entrezgene_id}{"chr"} eq "chrX" && $chr eq "chrY") {
				die "Mismatched chr or strand for id $entrezgene_id";
			}
		}
	} else {
		$refseq_data{$entrezgene_id}{"chr"} = $chr;
		$refseq_data{$entrezgene_id}{"strand"} = $strand;
		$refseq_data{$entrezgene_id}{"min_start"} = $start;
		$refseq_data{$entrezgene_id}{"max_stop"} = $stop;
	}
}

close(REFSEQ_FILE);
# Process RefSeq File - End
################################################################################

################################################################################
# Create New Info File - Start
# Iterate over the Ensembl/UCSC data; lookup the EntrezGene ID
# via the Transcript ID; then lookup the RefSeq position via
# the EntrezGene ID, and compare to the Ensembl/UCSC position
# If the two positions are sufficiently close (100k), then
# write out to a shiny new version of the ID mapping file
my %seen;
my $tmp_data_file;

if ($source eq 'epcondb') {
	($tmp_data_file = $data_file) =~ s/\*/ALL/g;
	`cat $data_file > $tmp_data_file`;
	open(DATA_FILE, $tmp_data_file) || die "Cannot open file $tmp_data_file";
} else {
	open(DATA_FILE, $data_file) || die "Cannot open file $data_file";
}

while (my $line = <DATA_FILE>) {
	my ($bin, $id, $chr, $strand, $transcript_start, $transcript_stop);

	if ($source eq 'epcondb') {
		($chr, $transcript_start, $transcript_stop, $strand, $id) =
			$line =~ /^([^\t]*)\t[^\t]*\t[^\t]*\t([^\t]*)\t([^\t]*)\t[^\t]*\t([^\t]*)\t[^\t]*\t([^\t\n]*)$/;
		next unless $id;
	} elsif ($source eq 'ensembl') {
		($bin, $id, $chr, $strand, $transcript_start, $transcript_stop) =
			$line =~ /^([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)/;
		$id =~ s/\.\d+//;
	} else {
		($id, $chr, $strand, $transcript_start, $transcript_stop) =
			$line =~ /^([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)/;
		$id =~ s/\.\d+//;
	}

	foreach my $entrezgene_id (keys %{$gene_ids{$id}}) {
		if (exists $refseq_data{$entrezgene_id}) {
			if ($refseq_data{$entrezgene_id}{"chr"} eq $chr &&
				$refseq_data{$entrezgene_id}{"strand"} eq $strand &&
				$refseq_data{$entrezgene_id}{"min_start"} < ($transcript_start+100000) &&
				$refseq_data{$entrezgene_id}{"max_stop"} > ($transcript_stop-100000))
			{
				unless (exists $seen{"$id$entrezgene_id"}) {
					if ($native_id_first) {
						print NEW_INFO_FILE "$id\t$entrezgene_id\n";
					} else {
						print NEW_INFO_FILE "$entrezgene_id\t$id\n";
					}
					$seen{"$id$entrezgene_id"} = 1;
				}
			}
		} else { 
			unless (exists $seen{"$id$entrezgene_id"}) {
				if ($native_id_first) {
					print NEW_INFO_FILE "$id\t$entrezgene_id\n";
				} else {
					print NEW_INFO_FILE "$entrezgene_id\t$id\n";
				}
				$seen{"$id$entrezgene_id"} = 1;
			}
		}
	}
}

close(DATA_FILE);
close(NEW_INFO_FILE);

if ($source eq 'epcondb') {
	unlink "$tmp_data_file";
}
`mv $new_info_file $info_file`;
# Create New Info File - End
################################################################################