<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article
  PUBLIC "-//NLM//DTD Journal Publishing DTD v3.0 20080202//EN" "http://dtd.nlm.nih.gov/publishing/3.0/journalpublishing3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="3.0" xml:lang="EN">
  <front>
    <journal-meta><journal-id journal-id-type="publisher-id">plos</journal-id><journal-id journal-id-type="nlm-ta">PLoS Comput Biol</journal-id><journal-id journal-id-type="pmc">ploscomp</journal-id><!--===== Grouping journal title elements =====--><journal-title-group><journal-title>PLoS Computational Biology</journal-title></journal-title-group><issn pub-type="ppub">1553-734X</issn><issn pub-type="epub">1553-7358</issn><publisher>
        <publisher-name>Public Library of Science</publisher-name>
        <publisher-loc>San Francisco, USA</publisher-loc>
      </publisher></journal-meta>
    <article-meta><article-id pub-id-type="publisher-id">PCOMPBIOL-D-11-01265</article-id><article-id pub-id-type="doi">10.1371/journal.pcbi.1002529</article-id><article-categories>
        <subj-group subj-group-type="heading">
          <subject>Research Article</subject>
        </subj-group>
        <subj-group subj-group-type="Discipline-v2">
          <subject>Biology</subject>
          <subj-group>
            <subject>Computational biology</subject>
          </subj-group>
          <subj-group>
            <subject>Genetics</subject>
          </subj-group>
          <subj-group>
            <subject>Genomics</subject>
          </subj-group>
        </subj-group>
        <subj-group subj-group-type="Discipline-v2">
          <subject>Mathematics</subject>
          <subj-group>
            <subject>Applied mathematics</subject>
          </subj-group>
          <subj-group>
            <subject>Probability theory</subject>
          </subj-group>
          <subj-group>
            <subject>Statistics</subject>
          </subj-group>
        </subj-group>
        <subj-group subj-group-type="Discipline">
          <subject>Genetics and Genomics</subject>
          <subject>Computational Biology</subject>
          <subject>Mathematics</subject>
        </subj-group>
      </article-categories><title-group><article-title>Exploring Massive, Genome Scale Datasets with the GenometriCorr Package</article-title><alt-title alt-title-type="running-head">Correlation across Genomewide Data</alt-title></title-group><contrib-group>
        <contrib contrib-type="author" equal-contrib="yes" xlink:type="simple">
          <name name-style="western">
            <surname>Favorov</surname>
            <given-names>Alexander</given-names>
          </name>
          <xref ref-type="aff" rid="aff1">
            <sup>1</sup>
          </xref>
          <xref ref-type="aff" rid="aff2">
            <sup>2</sup>
          </xref>
          <xref ref-type="aff" rid="aff3">
            <sup>3</sup>
          </xref>
          <xref ref-type="corresp" rid="cor1">
            <sup>*</sup>
          </xref>
        </contrib>
        <contrib contrib-type="author" equal-contrib="yes" xlink:type="simple">
          <name name-style="western">
            <surname>Mularoni</surname>
            <given-names>Loris</given-names>
          </name>
          <xref ref-type="aff" rid="aff1">
            <sup>1</sup>
          </xref>
          <xref ref-type="fn" rid="fn1">
            <sup>¤a</sup>
          </xref>
        </contrib>
        <contrib contrib-type="author" xlink:type="simple">
          <name name-style="western">
            <surname>Cope</surname>
            <given-names>Leslie M.</given-names>
          </name>
          <xref ref-type="aff" rid="aff1">
            <sup>1</sup>
          </xref>
        </contrib>
        <contrib contrib-type="author" xlink:type="simple">
          <name name-style="western">
            <surname>Medvedeva</surname>
            <given-names>Yulia</given-names>
          </name>
          <xref ref-type="aff" rid="aff2">
            <sup>2</sup>
          </xref>
          <xref ref-type="aff" rid="aff3">
            <sup>3</sup>
          </xref>
          <xref ref-type="fn" rid="fn2">
            <sup>¤b</sup>
          </xref>
        </contrib>
        <contrib contrib-type="author" xlink:type="simple">
          <name name-style="western">
            <surname>Mironov</surname>
            <given-names>Andrey A.</given-names>
          </name>
          <xref ref-type="aff" rid="aff4">
            <sup>4</sup>
          </xref>
          <xref ref-type="aff" rid="aff5">
            <sup>5</sup>
          </xref>
        </contrib>
        <contrib contrib-type="author" xlink:type="simple">
          <name name-style="western">
            <surname>Makeev</surname>
            <given-names>Vsevolod J.</given-names>
          </name>
          <xref ref-type="aff" rid="aff2">
            <sup>2</sup>
          </xref>
          <xref ref-type="aff" rid="aff3">
            <sup>3</sup>
          </xref>
        </contrib>
        <contrib contrib-type="author" xlink:type="simple">
          <name name-style="western">
            <surname>Wheelan</surname>
            <given-names>Sarah J.</given-names>
          </name>
          <xref ref-type="aff" rid="aff1">
            <sup>1</sup>
          </xref>
          <xref ref-type="corresp" rid="cor1">
            <sup>*</sup>
          </xref>
        </contrib>
      </contrib-group><aff id="aff1"><label>1</label><addr-line>Department of Oncology, Division of Biostatistics and Bioinformatics, Johns Hopkins University School of Medicine, Baltimore, Maryland, United States of America</addr-line>       </aff><aff id="aff2"><label>2</label><addr-line>Vavilov Institute of General Genetics, Russian Academy of Sciences, Moscow, Russia</addr-line>       </aff><aff id="aff3"><label>3</label><addr-line>Research Institute of Genetics and Selection of Industrial Microorganisms, Moscow, Russia</addr-line>       </aff><aff id="aff4"><label>4</label><addr-line>Department of Bioengineering and Bioinformatics, Moscow State University, Moscow, Russia</addr-line>       </aff><aff id="aff5"><label>5</label><addr-line>Institute for Information Transmission Problems, Russian Academy of Sciences, Moscow, Russia</addr-line>       </aff><contrib-group>
        <contrib contrib-type="editor" xlink:type="simple">
          <name name-style="western">
            <surname>Lapp</surname>
            <given-names>Hilmar</given-names>
          </name>
          <role>Editor</role>
          <xref ref-type="aff" rid="edit1"/>
        </contrib>
      </contrib-group><aff id="edit1">National Evolutionary Synthesis Center, United States of America</aff><author-notes>
        <corresp id="cor1">* E-mail: <email xlink:type="simple">favorov@sensi.org</email> (AF); <email xlink:type="simple">swheelan@jhmi.edu</email> (SJW)</corresp>
        <fn fn-type="current-aff" id="fn1">
          <label>¤a</label>
          <p>Current address: Institut d'Investigacions Biomèdiques August Pi i Sunyer (IDIBAPS), Barcelona, Spain.</p>
        </fn>
        <fn fn-type="current-aff" id="fn2">
          <label>¤b</label>
          <p>Current address: Computational Bioscience Research Center, King Abdullah University of Science and Technology, Thuwal, Saudi Arabia.</p>
        </fn>
        <fn fn-type="con">
          <p>Conceived and designed the experiments: AF LM LMC YM AAM VJM SJW. Performed the experiments: AF LM SJW. Analyzed the data: AF LM SJW. Contributed reagents/materials/analysis tools: AF LM LMC SJW. Wrote the paper: LMC SJW.</p>
        </fn>
      <fn fn-type="conflict">
        <p>The authors have declared that no competing interests exist.</p>
      </fn></author-notes><pub-date pub-type="collection">
        <month>5</month>
        <year>2012</year>
      </pub-date><pub-date pub-type="epub">
        <day>31</day>
        <month>5</month>
        <year>2012</year>
      </pub-date><volume>8</volume><issue>5</issue><elocation-id>e1002529</elocation-id><history>
        <date date-type="received">
          <day>24</day>
          <month>8</month>
          <year>2011</year>
        </date>
        <date date-type="accepted">
          <day>8</day>
          <month>4</month>
          <year>2012</year>
        </date>
      </history><!--===== Grouping copyright info into permissions =====--><permissions><copyright-year>2012</copyright-year><copyright-holder>Favorov et al</copyright-holder><license><license-p>This is an open-access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p></license></permissions><abstract>
        <p>We have created a statistically grounded tool for determining the correlation of genomewide data with other datasets or known biological features, intended to guide biological exploration of high-dimensional datasets, rather than providing immediate answers. The software enables several biologically motivated approaches to these data and here we describe the rationale and implementation for each approach. Our models and statistics are implemented in an R package that efficiently calculates the spatial correlation between two sets of genomic intervals (data and/or annotated features), for use as a metric of functional interaction. The software handles any type of pointwise or interval data and instead of running analyses with predefined metrics, it computes the significance and direction of several types of spatial association; this is intended to suggest potentially relevant relationships between the datasets.</p>
        <p>Availability and implementation: The package, GenometriCorr, can be freely downloaded at <ext-link ext-link-type="uri" xlink:href="http://genometricorr.sourceforge.net/" xlink:type="simple">http://genometricorr.sourceforge.net/</ext-link>. Installation guidelines and examples are available from the sourceforge repository. The package is pending submission to Bioconductor.</p>
      </abstract><funding-group><funding-statement>The work was supported by Russian Foundation for Basic Research (grant 11-04-02016-a to AF, 09-04-92742 to AM), by the Presidium of Russian Academy of Science Program in Molecular and Cell Biology (VJM, AM) and in Basic Science for Medicine (AM), by “Russian Ministry of Science and Education State Contract (07.514.11.4005)” (VJM), by National Center for Research Resources grant (UL2RR025005 and supplement, UL1RR025005) (SJW), by the Johns Hopkins University Framework for the Future (SJW), and by the Commonwealth Foundation (SJW). The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</funding-statement></funding-group><counts>
        <page-count count="12"/>
      </counts></article-meta>
  </front>
  <body>
    <sec id="s1">
      <title/>
      <disp-quote>
        <p>This is a <italic>PLoS Computational Biology</italic> Software Article</p>
      </disp-quote>
    </sec>
    <sec id="s2">
      <title>Introduction</title>
      <p>Manual exploration of high-dimensional whole-genome datasets is possible, to a limited extent, with newer, high-capacity genome browsers. While biologists can browse their data and can often suggest relevant hypotheses for statistical testing, fully informed and thorough data exploration is impossible to do by eye.</p>
      <p>A common theme in biological experiments is that the nucleotide-level proximity of a set of genomic regions (points or intervals) to genome annotations or to other experimentally derived data sets (such as coverage peaks, mutation locations, and breakpoints) is a useful proxy for a functionally relevant or otherwise interesting interaction. For example, the well established overlap of CpG islands with the promoter regions of genes <xref ref-type="bibr" rid="pcbi.1002529-Bird1">[1]</xref> is critically related to the gene-silencing mechanism of DNA hypermethylation.</p>
      <p>While using spatial proximity to infer functional relationships is a valid approach in many cases, proximity is not necessary for functional interaction, as chromatin is flexible and many activating and repressive marks act at a distance <xref ref-type="bibr" rid="pcbi.1002529-Giles1">[2]</xref>, so ideally any software that attempts to automatically uncover important relationships should be sensitive to these interactions as well. Others have given thought to examining some of the interactions that we will discuss (for a review see Bickel et al., 2009 <xref ref-type="bibr" rid="pcbi.1002529-Bickel1">[3]</xref>); however, the only software for performing such analyses focuses on overlapping features <xref ref-type="bibr" rid="pcbi.1002529-Bickel2">[4]</xref>.</p>
      <p>Here we present a method for identifying whether two sets of intervals are spatially correlated across a genome, detected as a deviation from a uniform distribution of one set of intervals with respect to the other. This is not a trivial task, computationally or conceptually. Many different spatial rearrangements are possible, each with different biological implications and each requiring specialized statistical analyses. The software performs all analyses on each input, so that a variety of biologically significant relationships are queried. This includes looking for proximity, looking for uniform spacing, looking for increased or decreased overlaps of intervals or points, and presenting the data in a way that a biologist can understand. Results from each test are provided for each chromosome from the dataset and for the entirety of the dataset, which in most cases is the entire genome. As we have found that asking “is A related to B” is completely different from asking “is B related to A,” we encourage users to not only perform all comparisons but to perform them in both orientations.</p>
    </sec>
    <sec id="s3">
      <title>Design and Implementation</title>
      <sec id="s3a">
        <title>Overview and general considerations</title>
        <p>GenometriCorr is written in R (using S4 classes) and makes use of the Bioconductor <xref ref-type="bibr" rid="pcbi.1002529-Gentleman1">[5]</xref> packages IRanges and GRanges to create sets of intervals that are then compared. The input data can be in a variety of commonly used biological data formats, and the core functions work from a configuration file that sets parameters in a straightforward, easy to edit format that can be archived to ensure reproducibility. We provide a Tk interface so that non-programmers may access the functions via straightforward menus, and we also provide a Galaxy <xref ref-type="bibr" rid="pcbi.1002529-Gangadharan1">[6]</xref> plugin that runs the analysis in an environment widely used by biologists, in which the results may be explored more thoroughly. Finally, we provide two auxiliary methods for output, so that graphical results can be obtained in addition to the statistics produced by the main function. The configuration and the result, GenometriCorrConfig and GenometriCorrResult, are designed such that once a configuration file has been read, the software proceeds with a simple run.config call.</p>
        <p>The main function, GenometriCorrelation, implements various statistical approaches to assess whether the positions of two sets of intervals are associated in genomic space. As stated above, almost all of the tests are asymmetrical, in that one of the two interval sets is considered to be a reference, fixed in the genome, while the other set, the query, is evaluated statistically with respect to the fixed reference. The results can be very different if the reference and query sets are swapped, as shown below. In essence, each of the tests is designed to evaluate whether the spatial distribution of the query intervals is independent of the positions of the reference intervals, and each test is sensitive to a different aspect of known biological relationships.</p>
        <p>Two types of graphical output are produced. Calling the graphics.plot() function produces a straightforward statistical summary and ECDF plot for the relative and absolute distances for each chromosome and/or the entire genome. Summary results for all chromosomes together are displayed in a window and results for the individual chromosomes are written to a PDF. The visualize() function produces a more elaborate and biologist-friendly color-coded density plot, intended to represent areas of high and low absolute and relative distance correlation; again, summary results appear in a window and chromosome-by-chromosome results are written to a PDF. The two types of output are shown in <xref ref-type="fig" rid="pcbi-1002529-g001">Figure 1</xref>, along with detailed descriptions of the features of each. These data are Hermes transposon insertions in the yeast genome; this transposon generally inserts into nucleosome-free regions <xref ref-type="bibr" rid="pcbi.1002529-Blankenberg1">[7]</xref>.</p>
        <fig id="pcbi-1002529-g001" position="float">
          <object-id pub-id-type="doi">10.1371/journal.pcbi.1002529.g001</object-id>
          <label>Figure 1</label>
          <caption>
            <title>Two types of graphic output are available.</title>
            <p>(A) A statistical summary and ECDF plots. (B) A graphical interpretation of the spatial relationships. The query features are depicted along the plot according to their distance to a reference feature; the colors indicate deviation from the expected distribution while the overlay line indicates the density of the data at each absolute or relative distance. The data density mirrors but is independent from the log-odds colors; at small distances in the absolute distance plot the data density is higher than expected but this represents a very small percentage of the total query points.</p>
          </caption>
          <graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.g001" xlink:type="simple"/>
        </fig>
        <p>An important consideration here is that two sets of genomic features may only be correlated in one direction. As an example, we found all NF-kappa-B (NFKB1) sites using a simple exact string search of the human genome and correlated their positions to the positions of all RefSeq gene <xref ref-type="bibr" rid="pcbi.1002529-Pruitt1">[8]</xref> start sites. NF-kappa-B is a family of transcription factors critical in many processes, including immunity, inflammation, and cell growth <xref ref-type="bibr" rid="pcbi.1002529-Oeckinghaus1">[9]</xref>.</p>
        <p>As there are nearly five times as many transcription start sites as potential NF-kappa-B sites, most transcription start sites will not be near an NF-kappa-B site even if they are perfectly correlated, while the NF-kappa-B sites will nearly all be close to transcription start sites. <xref ref-type="fig" rid="pcbi-1002529-g002">Figure 2a</xref> depicts the excellent correlation of human NF-kappa-B sites to transcription start sites (same distribution, perfect absolute distance correlation), and <xref ref-type="fig" rid="pcbi-1002529-g002">Figure 2b</xref> depicts the poor (and not statistically significant) correlation of transcription start sites to NF-kappa-B sites (absolute distance indicates a separation, K-S not significant). As this level of asymmetry is common, if not expected, in biological datasets, we recommend performing all comparisons in both directions, using each dataset as the fixed set in turn. While the relevant comparison is not known a priori, the results of the two comparisons will clearly indicate whether the relationship is asymmetric.</p>
        <fig id="pcbi-1002529-g002" position="float">
          <object-id pub-id-type="doi">10.1371/journal.pcbi.1002529.g002</object-id>
          <label>Figure 2</label>
          <caption>
            <title>NFkappaB sites vs human RefSeq promoter start sites.</title>
            <p>Query and reference colors as in <xref ref-type="fig" rid="pcbi-1002529-g001">Figure 1</xref>. (A) NFkappaB as the query gives a significant Kolmogorov-Smirnov association and anticorrelation that is visible from the graph, in absolute distances. (B) Correlation in the reverse direction suggests no significant relationship between the two classes of sites.</p>
          </caption>
          <graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.g002" xlink:type="simple"/>
        </fig>
        <p>Many of the tests we used are performed on pointwise representations of the intervals rather than on the intervals themselves. When the input includes intervals, the midpoints of these intervals are used for those analyses. Very large intervals may relate to genomic features in different ways, depending on whether we examine their start points, stop points, both boundaries, or just a point in the middle. Rather than trying to address this ambiguity or to randomly guess at what the user hopes to do, we expect the user to specify the points when the exact point is important, and we use the midpoint when the user inputs an interval.</p>
        <p>We have developed and tested four useful and relevant metrics, which will be discussed below: the relative distance test, the absolute distance test, the Jaccard test, and the projection test, intended to measure a variety of biologically relevant correlations. In <xref ref-type="fig" rid="pcbi-1002529-g003">Figure 3</xref> we summarize the metrics and their uses, and in subsequent figures we demonstrate the utility of each type of test, using both published and novel observations. Each figure shows a standard histogram representation of the relationships between the query and reference sets, in addition to the results and p-values generated by our software. As a strong correlation between the query and reference may involve just a subset of a very large number of points, a histogram of the absolute or relative distances can be uninformative, while the tests performed by the software are sensitive to true correlations within large and overall not strongly correlated datasets. All p-values cited are computed by the GenometriCorr functions. For each test we describe a published dataset for which the test is particularly useful. <xref ref-type="table" rid="pcbi-1002529-t001">Table 1</xref> and <xref ref-type="table" rid="pcbi-1002529-t002">Table 2</xref> summarize the results.</p>
        <fig id="pcbi-1002529-g003" position="float">
          <object-id pub-id-type="doi">10.1371/journal.pcbi.1002529.g003</object-id>
          <label>Figure 3</label>
          <caption>
            <title>A schematic of the various tests implemented in the software package, showing when certain tests are most useful.</title>
            <p>(A) depicts the intervals created in silico and (B) shows how the query distances are evaluated within the intervals. (C) depicts a random distribution of query versus reference intervals; here the observed and expected distances for both the absolute and relative tests are the same. In (D) we show a relationship best uncovered by the absolute distance test; useful especially for small genomes, this test determines whether the query and reference are often separated by a fixed distance. In (E), the query points are consistently far away from the reference points, so the relative distance test will be significant, while the absolute distances are not significant in this case. Interestingly, the query intervals are variable enough in size that even though the query and reference points are usually separated, the absolute distances between them vary widely in size, including some fairly small distances. (F) demonstrates the projection test, which evaluates whether pointwise data falls consistently inside or outside of a set of intervals. Finally, in (G) we see the Jaccard test, which looks for significant overlaps between datasets by evaluating the ratio of the intersection of the datasets (dark grey) to the union of the datasets (light grey). Perfect correlation will give a ratio of 1, and perfect anticorrelation will result in a ratio of zero.</p>
          </caption>
          <graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.g003" xlink:type="simple"/>
        </fig>
        <table-wrap id="pcbi-1002529-t001" position="float"><object-id pub-id-type="doi">10.1371/journal.pcbi.1002529.t001</object-id><label>Table 1</label><caption>
            <title>Summary of all correlations performed in these experiments.</title>
          </caption><!--===== Grouping alternate versions of objects =====--><alternatives><graphic id="pcbi-1002529-t001-1" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.t001" xlink:type="simple"/><table>
            <colgroup span="1">
              <col align="left" span="1"/>
              <col align="center" span="1"/>
              <col align="center" span="1"/>
              <col align="center" span="1"/>
              <col align="center" span="1"/>
              <col align="center" span="1"/>
              <col align="center" span="1"/>
            </colgroup>
            <thead>
              <tr>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">Direction</td>
                <td align="left" colspan="1" rowspan="1">Relative Kolmogorov-Smirnov p–value</td>
                <td align="left" colspan="1" rowspan="1">Relative ECDF area correlation</td>
                <td align="left" colspan="3" rowspan="1">Relative ECDF deviation area p–value</td>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">C</td>
                <td align="left" colspan="1" rowspan="1">F</td>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td align="left" colspan="1" rowspan="1">Human transcription start sites (T) versus NF-kappa-B sites (N)</td>
                <td align="left" colspan="1" rowspan="1">T→N</td>
                <td align="left" colspan="1" rowspan="1">2e−07</td>
                <td align="left" colspan="1" rowspan="1">0.015</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">N→T</td>
                <td align="left" colspan="1" rowspan="1">0.13</td>
                <td align="left" colspan="1" rowspan="1">0.012</td>
                <td align="left" colspan="1" rowspan="1">0.072</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1">L1 elements (T) versus Splice Sites (S)</td>
                <td align="left" colspan="1" rowspan="1">T→S</td>
                <td align="left" colspan="1" rowspan="1">∼0<xref ref-type="table-fn" rid="nt101">*</xref></td>
                <td align="left" colspan="1" rowspan="1">−0.03</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">S→T</td>
                <td align="left" colspan="1" rowspan="1">∼0</td>
                <td align="left" colspan="1" rowspan="1">−0.16</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1">Promoter sites (P) versus Promoter plus spikein (S)</td>
                <td align="left" colspan="1" rowspan="1">P→S</td>
                <td align="left" colspan="1" rowspan="1">∼0</td>
                <td align="left" colspan="1" rowspan="1">0.25</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">S→P</td>
                <td align="left" colspan="1" rowspan="1">∼0</td>
                <td align="left" colspan="1" rowspan="1">0.25</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1">H3K4me3 histones (H) versus Promoters of actively transcribed genes (P)</td>
                <td align="left" colspan="1" rowspan="1">H→P</td>
                <td align="left" colspan="1" rowspan="1">∼0</td>
                <td align="left" colspan="1" rowspan="1">0.22</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">P→H</td>
                <td align="left" colspan="1" rowspan="1">∼0</td>
                <td align="left" colspan="1" rowspan="1">0.02</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1">CpG Islands (I) versus Coding sequences (C)</td>
                <td align="left" colspan="1" rowspan="1">I→C</td>
                <td align="left" colspan="1" rowspan="1">∼0</td>
                <td align="left" colspan="1" rowspan="1">−0.195</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">C→I</td>
                <td align="left" colspan="1" rowspan="1">∼0</td>
                <td align="left" colspan="1" rowspan="1">−0.012</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
              </tr>
            </tbody>
          </table></alternatives><table-wrap-foot>
            <fn id="nt101">
              <label>*</label>
              <p>∼0 means that the default R precision for KS test p-value is not enough to distinguish the value from 0.</p>
            </fn>
          </table-wrap-foot></table-wrap>
        <table-wrap id="pcbi-1002529-t002" position="float"><object-id pub-id-type="doi">10.1371/journal.pcbi.1002529.t002</object-id><label>Table 2</label><caption>
            <title>Continued summary of correlations performed in the experiments described.</title>
          </caption><!--===== Grouping alternate versions of objects =====--><alternatives><graphic id="pcbi-1002529-t002-2" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.t002" xlink:type="simple"/><table>
            <colgroup span="1">
              <col align="left" span="1"/>
              <col align="center" span="1"/>
              <col align="center" span="1"/>
              <col align="center" span="1"/>
              <col align="center" span="1"/>
              <col align="center" span="1"/>
              <col align="center" span="1"/>
              <col align="center" span="1"/>
              <col align="center" span="1"/>
              <col align="center" span="1"/>
              <col align="center" span="1"/>
            </colgroup>
            <thead>
              <tr>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">
                  <italic>Direction</italic>
                </td>
                <td align="left" colspan="3" rowspan="1">Scaled Absolute minimal distance <italic>p–value</italic></td>
                <td align="left" colspan="3" rowspan="1">Jaccard Measure <italic>p–value</italic></td>
                <td align="left" colspan="3" rowspan="1">Projection test <italic>p-value</italic></td>
              </tr>
              <tr>
                <td align="left" colspan="3" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">C</td>
                <td align="left" colspan="1" rowspan="1">F</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">O</td>
                <td align="left" colspan="1" rowspan="1">N</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">O</td>
                <td align="left" colspan="1" rowspan="1">N</td>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td align="left" colspan="1" rowspan="1">Human transcription start sites (T) versus NF-kappa-B sites (N)</td>
                <td align="left" colspan="1" rowspan="1">T→N</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1">0.0091</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">N→T</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1">∼0<xref ref-type="table-fn" rid="nt102">*</xref></td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1">L1 elements (T) versus Splice Sites (S)</td>
                <td align="left" colspan="1" rowspan="1">T→S</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1">∼0</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">S→T</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1">∼0</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1">Promoter sites (P) versus Promoter plus spikein (S)</td>
                <td align="left" colspan="1" rowspan="1">P→S</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1">∼0</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">S→P</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1">∼0</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1">H3K4me3 histones (H) versus Promoters of actively transcribed genes (P)</td>
                <td align="left" colspan="1" rowspan="1">H→P</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">∼0</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">P→H</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">∼0</td>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1"/>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1">CpG Islands (I) versus Coding sequences (C)</td>
                <td align="left" colspan="1" rowspan="1">I→C</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1">∼0</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
              </tr>
              <tr>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">C→I</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1">&lt;0.001</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
                <td align="left" colspan="1" rowspan="1">∼0</td>
                <td align="left" colspan="1" rowspan="1"/>
                <td align="left" colspan="1" rowspan="1">√</td>
              </tr>
            </tbody>
          </table></alternatives><table-wrap-foot>
            <fn id="nt102">
              <label>*</label>
              <p>∼0 means that the default R precision for the binomial test is not enough to distinguish the value from 0.</p>
            </fn>
            <fn id="nt103">
              <label/>
              <p>P-values are shown for all tests in both directions (using one dataset as the query and the other as the reference, then reversing). For each test, we have indicated whether the relationship between the datasets is positive or negative; for the relative and absolute distance tests this is written as “close (C)” vs “far (F)” and for the Jaccard and projection tests it is written as “overlapping (O)” vs “nonoverlapping (N).”</p>
            </fn>
          </table-wrap-foot></table-wrap>
      </sec>
      <sec id="s3b">
        <title>Relative distance test</title>
        <p>The relative distance test measures whether two sets of positions are closer together or further apart than expected, where the exact distances are not as important as the relative relationship. For example, a recent publication <xref ref-type="bibr" rid="pcbi.1002529-Zhang1">[10]</xref> reported that transposable elements found in genes tend not to be located near splice sites. We tested this association with the GenometriCorr software and found that, first, the two entities do not overlap (the Jaccard and projection tests, summarized in <xref ref-type="fig" rid="pcbi-1002529-g003">Figure 3</xref> and described in detail later, are both significant and in the lower tail) and that both the relative and absolute distance tests show a correlation. Upon closer examination, the transposable element and splice site positions are actually negatively correlated in terms of relative distance; that is, the two types of genomic features tend not to co-occur but are consistently spaced apart, though not by a uniform distance (<xref ref-type="fig" rid="pcbi-1002529-g004">Figure 4</xref>). Results are shown for Alu elements but the relationship holds true for both L1 and Alu elements, in agreement with the reported trends.</p>
        <fig id="pcbi-1002529-g004" position="float">
          <object-id pub-id-type="doi">10.1371/journal.pcbi.1002529.g004</object-id>
          <label>Figure 4</label>
          <caption>
            <title>Alu elements vs splice sites in the graphics.plot() output (A) and in the visualize() output (B).</title>
            <p>Alu elements are consistently located at a variable but always nonzero distance from splice sites. Query and reference colors as in <xref ref-type="fig" rid="pcbi-1002529-g001">Figure 1</xref>.</p>
          </caption>
          <graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.g004" xlink:type="simple"/>
        </fig>
        <p>The idea behind the relative distance test is that if the query locations are independent of the references, then the relative position of each query point, with respect to the adjacent reference points, will have a uniform distribution. Thus, the null distribution for the relative distance test as formulated here is simply a straight line at y = 0.5. For this test, intervals are represented as points, located at the midpoint of the interval, so that the size of intervals and overlap between query and reference are not included in the analysis.</p>
        <p>For each query point, <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e001" xlink:type="simple"/></inline-formula>, we identify the flanking reference points, <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e002" xlink:type="simple"/></inline-formula> and <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e003" xlink:type="simple"/></inline-formula>, and calculate the relative distance <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e004" xlink:type="simple"/></inline-formula> = (<inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e005" xlink:type="simple"/></inline-formula>,−<inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e006" xlink:type="simple"/></inline-formula>)/(<inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e007" xlink:type="simple"/></inline-formula>,−<inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e008" xlink:type="simple"/></inline-formula>), comparing this to a uniform null distribution. More formally, the “relative distance” <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e009" xlink:type="simple"/></inline-formula> for the <italic>i</italic>-th query point is:<disp-formula><graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e010" xlink:type="simple"/></disp-formula>and under the null, the <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e011" xlink:type="simple"/></inline-formula>'s would be distributed uniformly in [0, 1/2].</p>
        <p>Two different tests are available to evaluate the uniformity of the distribution of the <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e012" xlink:type="simple"/></inline-formula>'s. The first and simplest is the standard Kolmogorov-Smirnov test, assessing the maximum difference between CDFs.</p>
        <p>A permutation-based test is carried out as well, to more comprehensively compare the two cumulative distribution functions using the area of the region in which they differ as the test statistic. Here <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e013" xlink:type="simple"/></inline-formula> (<italic>Empirical Cumulative Distribution Function</italic>) designates the observed distribution of relative distances <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e014" xlink:type="simple"/></inline-formula>, while <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e015" xlink:type="simple"/></inline-formula> describes the expected distribution under the uniform, null distribution, which is a straight line. The area between the <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e016" xlink:type="simple"/></inline-formula> and <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e017" xlink:type="simple"/></inline-formula> is then calculated as:<disp-formula><graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e018" xlink:type="simple"/></disp-formula>and a <italic>p-value</italic> for S obtained by permutation, in which we randomly draw <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e019" xlink:type="simple"/></inline-formula> (number of query points) values from the ideal uniform distribution of <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e020" xlink:type="simple"/></inline-formula> and calculate the area <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e021" xlink:type="simple"/></inline-formula>.</p>
        <p>The integral of the difference between the <italic>ECDF</italic> and the <italic>ECDF<sub>ideal</sub></italic> also permits us to derive a natural measure of association for the two sets of intervals. The sign of the integral indicates the direction of the correlation, so the positive sign indicates that <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e022" xlink:type="simple"/></inline-formula>'s tend to be low and thus the query intervals are attracted to reference intervals while, vice versa, the negative sign suggests that query intervals avoid reference intervals. With appropriate rescaling, we define a correlation-like measure:<disp-formula><graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e023" xlink:type="simple"/></disp-formula>to express this. The <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e024" xlink:type="simple"/></inline-formula> equals zero for independent query and reference; it equals 1 if each query point coincides with a reference point and, finally, it equals <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e025" xlink:type="simple"/></inline-formula> if each query point falls in the middle of the reference gap.</p>
      </sec>
      <sec id="s3c">
        <title>Absolute distance test</title>
        <p>In some cases, particularly in small genomes in which reference points are closely spaced, the relative distance test produces misleading results. For example, if the promoters in a genome are generally found roughly 100–1500 bases apart (for example, yeast), an element that is positioned consistently 500 bp from promoters will look uncorrelated with promoters in relative terms, as it will sometimes be extremely close to a promoter that is not the one it is functionally related to, thereby diluting the distribution of query-reference distances with many incorrect data points. In these situations the absolute distance test is useful. We created a toy dataset for this analysis, first taking the positions of the start points of all human promoters (31083 sites), creating a new set of small intervals placed randomly from 10–10000 base pairs from each promoter start, and adding an additional 3000 small intervals randomly placed between 75 and 100 bp from a promoter. We then compared these intervals to the actual promoter intervals in the human genome, and the software uncovered the signal of the fixed distance points within the shifted points, whereas a simple histogram approach fails (<xref ref-type="fig" rid="pcbi-1002529-g005">Figure 5</xref>). The null distribution for the absolute distance test depends on the data. If the inter-reference intervals are somewhat randomly distributed, then the null distribution will be exponential, but if the inter-reference intervals are constrained somehow, the null distribution will have a very different shape.</p>
        <fig id="pcbi-1002529-g005" position="float">
          <object-id pub-id-type="doi">10.1371/journal.pcbi.1002529.g005</object-id>
          <label>Figure 5</label>
          <caption>
            <title>A toy example of absolute distance correlation.</title>
            <p>(A) Histograms of the observed and expected ranges of minimum distances between the reference and query. (B) GenometriCorr's simple plot for the same data. Query and reference colors as in <xref ref-type="fig" rid="pcbi-1002529-g001">Figure 1</xref>.</p>
          </caption>
          <graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.g005" xlink:type="simple"/>
        </fig>
        <p>As in the relative distance test, intervals are represented by their midpoints.</p>
        <p>The statistic used is very simple and intuitive. We suppose there are <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e026" xlink:type="simple"/></inline-formula> query points and <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e027" xlink:type="simple"/></inline-formula> reference points on a chromosome, and for each query point <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e028" xlink:type="simple"/></inline-formula>, find the distance to the closest reference point, scaled by the expected inter-reference gap for the chromosome<disp-formula><graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e029" xlink:type="simple"/></disp-formula>The final test statistic is the mean value of <italic>d<sub>i</sub></italic><disp-formula><graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e030" xlink:type="simple"/></disp-formula>which characterizes the spatial association between the query and the reference points. The lower it is, the closer together they tend to be.</p>
        <p>The <italic>p-value</italic> is obtained by permutation test. At each iteration, we draw <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e031" xlink:type="simple"/></inline-formula> simulated query points uniformly distributed along the chromosome and calculate the permuted statistic <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e032" xlink:type="simple"/></inline-formula>. The <italic>p-value</italic> is the proportion of permuted statistics that exceed the observed <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e033" xlink:type="simple"/></inline-formula>. As implemented in the package, the test is two-sided and returns both the <italic>p-value</italic> and the direction of the association.</p>
      </sec>
      <sec id="s3d">
        <title>Projection test</title>
        <p>Another test included in the software is the projection test. As seen in <xref ref-type="fig" rid="pcbi-1002529-g003">Figure 3</xref>, this tests whether pointwise data overlap interval data in a significant way. To confirm the biological relevance of this test we examined data from the Roadmap Epigenomics Project <xref ref-type="bibr" rid="pcbi.1002529-Bernstein1">[11]</xref>. Using the RNAseq data and histones H3K27me3 and H3K4me3 ChIP data from UCSF-UBC (GEO accessions GSM484408 (RNAseq), GSM428295 (H3K27me3), and GSM410808/GSM432392 (replicates for H3K4me3)), we used the projection test to examine the relationship between the two histone marks and the promoters of the most highly expressed genes. The software was able to determine that the H3K4me3 marks significantly overlap the gene positions (<xref ref-type="fig" rid="pcbi-1002529-g006">Figure 6A</xref>) and the H3K27me3 marks are significantly underrepresented near active genes (<xref ref-type="fig" rid="pcbi-1002529-g006">Figure 6B</xref>). Note that in both cases the projection test is highly significant, but in opposite directions; for the H3K4me3 data the projection test is in the lower tail, indicating significant overlap, while the opposite is true for the H3K27me3 data, indicating very little overlap with promoters of active genes.</p>
        <fig id="pcbi-1002529-g006" position="float">
          <object-id pub-id-type="doi">10.1371/journal.pcbi.1002529.g006</object-id>
          <label>Figure 6</label>
          <caption>
            <title>Promoter positions from highly expressed genes (as given from mRNAseq data) and histone ChIP data recently available from the Roadmap Epigenomics Project <xref ref-type="bibr" rid="pcbi.1002529-Pruitt1">[<bold>8</bold>]</xref>.</title>
            <p>(A) H3K4me3 versus highly expressed genes. (B) H3K27me3 versus highly expressed genes. Query and reference colors as in <xref ref-type="fig" rid="pcbi-1002529-g001">Figure 1</xref>.</p>
          </caption>
          <graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.g006" xlink:type="simple"/>
        </fig>
        <p>The projection test is the methodologically simpler of the two overlap tests in the package; the other, the Jaccard measure, is discussed below. For this test, query intervals are still represented as midpoints, but the reference should be a set of intervals. If the query points are independent of the reference, then the probability that any one query point is contained in a reference interval is the proportion of the chromosome covered by reference intervals:<disp-formula><graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e034" xlink:type="simple"/></disp-formula>The total number of query points <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e035" xlink:type="simple"/></inline-formula> contained in reference intervals has a binomial distribution,<disp-formula><graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e036" xlink:type="simple"/></disp-formula>A standard two-sided binomial test is used to evaluate statistical significance. The test is unlikely to be informative if the genomic coverage of the reference intervals is very close to 0 or 1; here the p-values will be extremely high.</p>
      </sec>
      <sec id="s3e">
        <title>Jaccard test</title>
        <p>The Jaccard test measures overlaps between two interval sets by measuring the extent of intersection between two interval sets, divided by the length of their union. Thus, two datasets that overlap perfectly have a union that is equal to their intersection, and the ratio is 1. This proves to be a very useful measure for biological data, as demonstrated in <xref ref-type="fig" rid="pcbi-1002529-g007">Figure 7</xref>, in which CpG islands <xref ref-type="bibr" rid="pcbi.1002529-Wu1">[12]</xref> are compared with coding sequences in the human genome. Comparing the CpG islands with the coding sequences we see that the two interval sets overlap much less than expected, given the amount of the genome that each occupies, and this anti-correlation is statistically significant. This is expected, as CpG islands generally occur in promoters and other non-genic regions.</p>
        <fig id="pcbi-1002529-g007" position="float">
          <object-id pub-id-type="doi">10.1371/journal.pcbi.1002529.g007</object-id>
          <label>Figure 7</label>
          <caption>
            <title>Human genomic CpG islands from Wu et al <xref ref-type="bibr" rid="pcbi.1002529-Oeckinghaus1">[<bold>9</bold>]</xref> correlated with the positions of coding sequences in the human genome.</title>
            <p>Query and reference colors as in <xref ref-type="fig" rid="pcbi-1002529-g001">Figure 1</xref>.</p>
          </caption>
          <graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.g007" xlink:type="simple"/>
        </fig>
        <p>The Jaccard statistic is calculated on intervals rather than points, and is the ratio of the number of nucleotides in the intersection of the reference and query to the total number of nucleotides spanned by the reference and query together.</p>
        <p>More formally:</p>
        <p>The Jaccard statistic, <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e037" xlink:type="simple"/></inline-formula>, evaluates interval sets <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e038" xlink:type="simple"/></inline-formula> and <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e039" xlink:type="simple"/></inline-formula> that are sets of chromosome positions that are covered by query and reference intervals, respectively.</p>
        <p><inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e040" xlink:type="simple"/></inline-formula> where <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e041" xlink:type="simple"/></inline-formula> denotes the size of a set Y.</p>
        <p>The <italic>p-value</italic> and the direction of difference from the null hypothesis (that the positions of <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e042" xlink:type="simple"/></inline-formula> and <inline-formula><inline-graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1002529.e043" xlink:type="simple"/></inline-formula> are independent) are obtained by permutation. Each permutation randomizes the query intervals uniformly across the chromosome, maintaining the spacing between intervals.</p>
      </sec>
      <sec id="s3f">
        <title>Comparisons limited to genomic subsets</title>
        <p>An investigator may want to explore correlations within defined intervals rather than genomewide; for example, when looking at binding sites within and very close to genes, the correlation between these sites will be extremely high genomewide because they are constrained to small intervals, but upon examination of the sites within genes, there may be no correlation at all. For this reason we provide methods that restrict the correlations to intervals defined by the investigator and that can be set up from within the configuration file.</p>
      </sec>
    </sec>
    <sec id="s4">
      <title>Results</title>
      <p>We tested the software on our own high-dimensional data, a set of cloned insertion sites of an exogenously supplied Ty1 retrotransposon in the gene-rich yeast genome, for which we were trying to determine Ty1 targeting specificity. After sequencing and mapping it was clear that the insertions cluster near tRNA genes but do not generally insert into these genes, as seen in <xref ref-type="fig" rid="pcbi-1002529-g008">Figure 8A</xref>. <xref ref-type="fig" rid="pcbi-1002529-g008">Figure 8B</xref> displays the very complex relationship between Ty1 and tRNA promoters; the insertions occur at very specific points along nucleosome-bound DNA and follow a reproducible periodic pattern. On further examination we were able to map the insertion sites precisely to the nucleosome surface, as we have previously reported <xref ref-type="bibr" rid="pcbi.1002529-Mularoni1">[13]</xref>.</p>
      <fig id="pcbi-1002529-g008" position="float">
        <object-id pub-id-type="doi">10.1371/journal.pcbi.1002529.g008</object-id>
        <label>Figure 8</label>
        <caption>
          <title>Ty1 retrotransposon insertion sites vs tRNA genes in the yeast genome.</title>
          <p>(A) ECDF plots. (B) Graphic display. Arrows mark Ty1 insertion sites at nucleosome-occupied positions near tRNA genes. Nucleosomes are in green. The colored graph contains several regions of high observed/expected Ty1 insertions (red colors), and the black line likewise indicates a high density of Ty1 insertions in these regions. Relative to the tRNA position, the Ty1 insertion sites are densest inside the nucleosome-occupied regions. Query and reference colors are as in <xref ref-type="fig" rid="pcbi-1002529-g001">Figure 1</xref>.</p>
        </caption>
        <graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.g008" xlink:type="simple"/>
      </fig>
      <p>The examples provided here illustrate the range of biological questions that can be addressed with our software. A particularly compelling feature of the package is that negative correlations (overlap or proximity much less than expected if the query and reference were unrelated) are reported, meaning that correlations between factors that act at a distance are detectable. Also, we observe that absolute and relative distances are both important and may measure different phenomena; thus the software provides appropriate tests for these correlations as well.</p>
      <p>We do not intend for our software to supplant the role of either the biologist or the statistician in a team of investigators working on high-throughput sequencing or microarray data. Rather, by determining the statistical significance of genomewide interactions, the software serves as a hypothesis generator, enabling all investigators to begin validating associations that are likely to be real, much earlier than they would have otherwise.</p>
      <p>We do not provide a built-in method for retrieving query and/or reference features that may conform to a configuration suggested by the correlation methods. As we provide methods to use the software from within the Galaxy interface (below, and <xref ref-type="fig" rid="pcbi-1002529-g009">Figure 9</xref>), users with minimal computational experience can create any desired subsets using the many tools available in that environment. More computationally experienced users will have no trouble extracting query and reference intervals and overlapping these intervals as suggested by the correlation output.</p>
      <fig id="pcbi-1002529-g009" position="float">
        <object-id pub-id-type="doi">10.1371/journal.pcbi.1002529.g009</object-id>
        <label>Figure 9</label>
        <caption>
          <title>(A) The Galaxy interface to GenometriCorr. (B) The Tk interface to GenometriCorr.</title>
          <p>Instructions for using both are found on the website.</p>
        </caption>
        <graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1002529.g009" xlink:type="simple"/>
      </fig>
      <p>GenometriCorr can be customized for use with any genome and any type of point or interval data.</p>
    </sec>
    <sec id="s5">
      <title>Availability and Future Directions</title>
      <p>GenometriCorr is available, along with examples and installation guidelines, from <ext-link ext-link-type="uri" xlink:href="http://genometricorr.sourceforge.net/" xlink:type="simple">http://genometricorr.sourceforge.net/</ext-link>. The software is written in R and can be used from the R command line, through a Tk graphical interface, or through the Galaxy interface; all of these options are documented on the site.</p>
      <p>In future work we plan to implement the generalized Jaccard measure, which can handle sparsely distributed query and reference sets. Moreover, the generalized Jaccard measure can account for intervals that have a weight or other numerical value, in addition to coordinates. This weight can denote anything from multiplicity of coverage to experimental confidence.</p>
    </sec>
  </body>
  <back>
    <ref-list>
      <title>References</title>
      <ref id="pcbi.1002529-Bird1">
        <label>1</label>
        <element-citation publication-type="journal" xlink:type="simple">             <person-group person-group-type="author"><name name-style="western"><surname>Bird</surname><given-names>AP</given-names></name></person-group>             <year>1986</year>             <article-title>CpG-rich islands and the function of DNA methylation.</article-title>             <source>Nature</source>             <volume>321</volume>             <fpage>209</fpage>             <lpage>213</lpage>          </element-citation>
      </ref>
      <ref id="pcbi.1002529-Giles1">
        <label>2</label>
        <element-citation publication-type="journal" xlink:type="simple">             <person-group person-group-type="author"><name name-style="western"><surname>Giles</surname><given-names>KE</given-names></name><name name-style="western"><surname>Gowher</surname><given-names>H</given-names></name><name name-style="western"><surname>Ghirlando</surname><given-names>R</given-names></name><name name-style="western"><surname>Jin</surname><given-names>C</given-names></name><name name-style="western"><surname>Felsenfeld</surname><given-names>G</given-names></name></person-group>             <year>2010</year>             <article-title>Chromatin boundaries, insulators, and long-range interactions in the nucleus.</article-title>             <source>Cold Spring Harb Symp Quant Biol</source>             <volume>75</volume>             <fpage>79</fpage>             <lpage>85</lpage>          </element-citation>
      </ref>
      <ref id="pcbi.1002529-Bickel1">
        <label>3</label>
        <element-citation publication-type="journal" xlink:type="simple">             <person-group person-group-type="author"><name name-style="western"><surname>Bickel</surname><given-names>PJ</given-names></name><name name-style="western"><surname>Brown</surname><given-names>JB</given-names></name><name name-style="western"><surname>Huang</surname><given-names>H</given-names></name><name name-style="western"><surname>Li</surname><given-names>Q</given-names></name></person-group>             <year>2009</year>             <article-title>An overview of recent developments in genomics and associated statistical methods.</article-title>             <source>Philos Transact A Math Phys Eng Sci</source>             <volume>367</volume>             <fpage>4313</fpage>             <lpage>4337</lpage>          </element-citation>
      </ref>
      <ref id="pcbi.1002529-Bickel2">
        <label>4</label>
        <element-citation publication-type="journal" xlink:type="simple">             <person-group person-group-type="author"><name name-style="western"><surname>Bickel</surname><given-names>PJ</given-names></name><name name-style="western"><surname>Boley</surname><given-names>N</given-names></name><name name-style="western"><surname>Brown</surname><given-names>JB</given-names></name><name name-style="western"><surname>Huang</surname><given-names>H</given-names></name><name name-style="western"><surname>Zhang</surname><given-names>NR</given-names></name></person-group>             <year>2010</year>             <article-title>Subsampling methods for genomic inference.</article-title>             <source>Ann Appl Stat</source>             <volume>4</volume>             <fpage>1660</fpage>             <lpage>1697</lpage>          </element-citation>
      </ref>
      <ref id="pcbi.1002529-Gentleman1">
        <label>5</label>
        <element-citation publication-type="journal" xlink:type="simple">             <person-group person-group-type="author"><name name-style="western"><surname>Gentleman</surname><given-names>RC</given-names></name><name name-style="western"><surname>Carey</surname><given-names>VJ</given-names></name><name name-style="western"><surname>Bates</surname><given-names>DM</given-names></name><name name-style="western"><surname>Bolstad</surname><given-names>B</given-names></name><name name-style="western"><surname>Dettling</surname><given-names>M</given-names></name><etal/></person-group>             <year>2004</year>             <article-title>Bioconductor: Open software development for computational biology and bioinformatics.</article-title>             <source>Genome Biol</source>             <volume>5</volume>             <fpage>R80</fpage>          </element-citation>
      </ref>
      <ref id="pcbi.1002529-Gangadharan1">
        <label>6</label>
        <element-citation publication-type="journal" xlink:type="simple">             <person-group person-group-type="author"><name name-style="western"><surname>Gangadharan</surname><given-names>S</given-names></name><name name-style="western"><surname>Mularoni</surname><given-names>L</given-names></name><name name-style="western"><surname>Fain-Thornton</surname><given-names>J</given-names></name><name name-style="western"><surname>Wheelan</surname><given-names>SJ</given-names></name><name name-style="western"><surname>Craig</surname><given-names>NL</given-names></name></person-group>             <year>2010</year>             <article-title>Inaugural article: DNA transposon Hermes inserts into DNA in nucleosome-free regions in vivo.</article-title>             <source>Proc Natl Acad Sci U S A</source>             <volume>107</volume>             <fpage>21966</fpage>             <lpage>21972</lpage>          </element-citation>
      </ref>
      <ref id="pcbi.1002529-Blankenberg1">
        <label>7</label>
        <element-citation publication-type="journal" xlink:type="simple">             <person-group person-group-type="author"><name name-style="western"><surname>Blankenberg</surname><given-names>D</given-names></name><name name-style="western"><surname>Von Kuster</surname><given-names>G</given-names></name><name name-style="western"><surname>Coraor</surname><given-names>N</given-names></name><name name-style="western"><surname>Ananda</surname><given-names>G</given-names></name><name name-style="western"><surname>Lazarus</surname><given-names>R</given-names></name><etal/></person-group>             <year>2010</year>             <article-title>Galaxy: A web-based genome analysis tool for experimentalists.</article-title>             <source>Curr Protoc Mol Biol</source>             <volume>Chapter 19</volume>             <fpage>Unit 19.10.1</fpage>             <lpage>21</lpage>          </element-citation>
      </ref>
      <ref id="pcbi.1002529-Pruitt1">
        <label>8</label>
        <element-citation publication-type="journal" xlink:type="simple">             <person-group person-group-type="author"><name name-style="western"><surname>Pruitt</surname><given-names>KD</given-names></name><name name-style="western"><surname>Tatusova</surname><given-names>T</given-names></name><name name-style="western"><surname>Maglott</surname><given-names>DR</given-names></name></person-group>             <year>2007</year>             <article-title>NCBI reference sequences (RefSeq): A curated non-redundant sequence database of genomes, transcripts and proteins.</article-title>             <source>Nucleic Acids Res</source>             <volume>35</volume>             <fpage>D61</fpage>             <lpage>D65</lpage>          </element-citation>
      </ref>
      <ref id="pcbi.1002529-Oeckinghaus1">
        <label>9</label>
        <element-citation publication-type="journal" xlink:type="simple">             <person-group person-group-type="author"><name name-style="western"><surname>Oeckinghaus</surname><given-names>A</given-names></name><name name-style="western"><surname>Ghosh</surname><given-names>S</given-names></name></person-group>             <year>2009</year>             <article-title>The NF-kappaB family of transcription factors and its regulation.</article-title>             <source>Cold Spring Harb Perspect Biol</source>             <volume>1</volume>             <fpage>a000034</fpage>          </element-citation>
      </ref>
      <ref id="pcbi.1002529-Zhang1">
        <label>10</label>
        <element-citation publication-type="journal" xlink:type="simple">             <person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names></name><name name-style="western"><surname>Romanish</surname><given-names>MT</given-names></name><name name-style="western"><surname>Mager</surname><given-names>DL</given-names></name></person-group>             <year>2011</year>             <article-title>Distributions of transposable elements reveal hazardous zones in mammalian introns.</article-title>             <source>PLoS Comput Biol</source>             <volume>7</volume>             <fpage>e1002046</fpage>          </element-citation>
      </ref>
      <ref id="pcbi.1002529-Bernstein1">
        <label>11</label>
        <element-citation publication-type="journal" xlink:type="simple">             <person-group person-group-type="author"><name name-style="western"><surname>Bernstein</surname><given-names>BE</given-names></name><name name-style="western"><surname>Stamatoyannopoulos</surname><given-names>JA</given-names></name><name name-style="western"><surname>Costello</surname><given-names>JF</given-names></name><name name-style="western"><surname>Ren</surname><given-names>B</given-names></name><name name-style="western"><surname>Milosavljevic</surname><given-names>A</given-names></name><etal/></person-group>             <year>2010</year>             <article-title>The NIH Roadmap Epigenomics Mapping Consortium.</article-title>             <source>Nat Biotechnol</source>             <volume>28</volume>             <fpage>1045</fpage>             <lpage>1048</lpage>          </element-citation>
      </ref>
      <ref id="pcbi.1002529-Wu1">
        <label>12</label>
        <element-citation publication-type="journal" xlink:type="simple">             <person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>H</given-names></name><name name-style="western"><surname>Caffo</surname><given-names>B</given-names></name><name name-style="western"><surname>Jaffee</surname><given-names>HA</given-names></name><name name-style="western"><surname>Irizarry</surname><given-names>RA</given-names></name><name name-style="western"><surname>Feinberg</surname><given-names>AP</given-names></name></person-group>             <year>2010</year>             <article-title>Redefining CpG islands using hidden Markov models.</article-title>             <source>Biostatistics</source>             <volume>11</volume>             <fpage>499</fpage>             <lpage>514</lpage>          </element-citation>
      </ref>
      <ref id="pcbi.1002529-Mularoni1">
        <label>13</label>
        <element-citation publication-type="journal" xlink:type="simple">             <person-group person-group-type="author"><name name-style="western"><surname>Mularoni</surname><given-names>L</given-names></name><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names></name><name name-style="western"><surname>Bowen</surname><given-names>T</given-names></name><name name-style="western"><surname>Gangadharan</surname><given-names>S</given-names></name><name name-style="western"><surname>Wheelan</surname><given-names>S</given-names></name><etal/></person-group>             <year>2012</year>             <article-title>Retrotransposon Ty1 integration targets specifically positioned asymmetric nucleosomal DNA segments in tRNA hotspots.</article-title>             <source>Genome Res</source>             <volume>22</volume>             <fpage>693</fpage>             <lpage>703</lpage>          </element-citation>
      </ref>
    </ref-list>
    
  </back>
</article>