uk.ac.shef.dcs.oak.jate.core.feature.indexer
Class GlobalIndex

java.lang.Object
  extended by uk.ac.shef.dcs.oak.jate.core.feature.indexer.GlobalIndex
Direct Known Subclasses:
GlobalIndexMem

public abstract class GlobalIndex
extends java.lang.Object

Indexes terms (canonical forms and their variants), documents, and the contains/occurs-in relations between terms and documents.


Field Summary
protected  int _docCounter
           
protected  int _termCounter
           
protected  int _variantCounter
           
 
Constructor Summary
GlobalIndex()
           
 
Method Summary
abstract  java.util.Set<java.lang.Integer> getDocumentIds()
           
abstract  java.util.Set<Document> getDocuments()
           
abstract  java.util.Set<java.lang.Integer> getTermCanonicalIds()
           
abstract  java.util.Set<java.lang.String> getTermsCanonical()
           
abstract  java.util.Set<java.lang.Integer> getTermVariantIds()
           
abstract  java.util.Set<java.lang.String> getTermVariants()
           
protected abstract  int indexDocument(Document d)
          Given a document, index it and return its id.
protected abstract  void indexDocWithTermsCanonical(Document d, java.util.Set<java.lang.String> terms)
          Given a document d which contains a set of terms (canonical form), index the binary relation "document contains term canonical"
protected abstract  void indexDocWithTermsCanonical(int d, java.util.Set<java.lang.Integer> terms)
          Given a document with id d which contains a set of terms (canonical form), index the binary relation "document contains term canonical"
protected abstract  int indexTermCanonical(java.lang.String term)
          Given a candidate term's canonical form, index it and return its id.
protected abstract  void indexTermCanonicalInDoc(int t, int d)
          Given a candidate term's canonical form id t found in document with id d, index the binary relation "t found_in d"
protected abstract  void indexTermCanonicalInDoc(java.lang.String t, Document d)
          Given a candidate term's canonical form t found in document d, index the binary relation "t found_in d"
protected abstract  int indexTermVariant(java.lang.String termV)
          Given a candidate term variant, index it and return its id.
protected abstract  void indexTermWithVariant(java.util.Map<java.lang.String,java.util.Set<java.lang.String>> map)
          Given a map containing [term canonical form - term variant forms], index the mapping
abstract  int retrieveCanonicalOfTermVariant(java.lang.String termVar)
          Given a term variant form, retrieve the id of its canonical form
abstract  java.util.Set<java.lang.Integer> retrieveDocIdsContainingTermCanonical(int id)
           
abstract  java.util.Set<java.lang.Integer> retrieveDocIdsContainingTermCanonical(java.lang.String t)
           
abstract  java.util.Set<Document> retrieveDocsContainingTermCanonical(int t)
           
abstract  java.util.Set<Document> retrieveDocsContainingTermCanonical(java.lang.String t)
           
abstract  int retrieveDocument(Document d)
          Given a document, return its id.
abstract  Document retrieveDocument(int id)
          Given a document id, return the document
abstract  java.lang.String retrieveTermCanonical(int id)
          Given an id, retrieve the candidate term's canonical form
abstract  int retrieveTermCanonical(java.lang.String term)
          Given a candidate term's canonical form, return its id.
abstract  java.util.Set<java.lang.Integer> retrieveTermCanonicalIdsInDoc(Document d)
           
abstract  java.util.Set<java.lang.Integer> retrieveTermCanonicalIdsInDoc(int d)
           
abstract  java.util.Set<java.lang.String> retrieveTermCanonicalInDoc(int d)
           
abstract  java.util.Set<java.lang.String> retrieveTermsCanonicalInDoc(Document d)
           
protected abstract  java.lang.String retrieveTermVariant(int id)
          Given an id of a candidate term variant, retrieve the text
abstract  java.util.Set<java.lang.String> retrieveVariantsOfTermCanonical(java.lang.String term)
          Given a term canonical form, retrieve its variant forms found in the corpus
abstract  int sizeDocHasTerms(Document d)
           
abstract  int sizeDocHasTerms(int d)
           
abstract  int sizeTermInDocs(int t)
           
abstract  int sizeTermInDocs(java.lang.String t)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

_termCounter

protected int _termCounter

_variantCounter

protected int _variantCounter

_docCounter

protected int _docCounter
Constructor Detail

GlobalIndex

public GlobalIndex()
Method Detail

indexTermCanonical

protected abstract int indexTermCanonical(java.lang.String term)
Given a candidate term's canonical form, index it and return its id.

Parameters:
term -
Returns:
the id

retrieveTermCanonical

public abstract java.lang.String retrieveTermCanonical(int id)
Given an id, retrieve the candidate term's canonical form

Parameters:
id -
Returns:

retrieveTermCanonical

public abstract int retrieveTermCanonical(java.lang.String term)
Given a candidate term's canonical form, return its id. (If the term has not been indexed, -1 will be returned.)

Parameters:
term -
Returns:

getTermCanonicalIds

public abstract java.util.Set<java.lang.Integer> getTermCanonicalIds()
Returns:
all indexed candidate term canonical form ids

getTermsCanonical

public abstract java.util.Set<java.lang.String> getTermsCanonical()
Returns:
all candidate term canonical forms

indexTermVariant

protected abstract int indexTermVariant(java.lang.String termV)
Given a candidate term variant, index it and return its id.

Parameters:
termV -
Returns:
the id

retrieveTermVariant

protected abstract java.lang.String retrieveTermVariant(int id)
Given an id of a candidate term variant, retrieve the text

Parameters:
id -
Returns:

retrieveCanonicalOfTermVariant

public abstract int retrieveCanonicalOfTermVariant(java.lang.String termVar)
Given a term variant form, retrieve the id of its canonical form

Parameters:
termVar -
Returns:

getTermVariantIds

public abstract java.util.Set<java.lang.Integer> getTermVariantIds()

getTermVariants

public abstract java.util.Set<java.lang.String> getTermVariants()

indexDocument

protected abstract int indexDocument(Document d)
Given a document, index it and return its id.

Parameters:
d -
Returns:
the id

retrieveDocument

public abstract Document retrieveDocument(int id)
Given a document id, return the document

Parameters:
id -
Returns:

retrieveDocument

public abstract int retrieveDocument(Document d)
Given a document, return its id. If the document has not been indexed, return -1

Parameters:
d -
Returns:
the id

getDocuments

public abstract java.util.Set<Document> getDocuments()
Returns:
all indexed documents

getDocumentIds

public abstract java.util.Set<java.lang.Integer> getDocumentIds()
Returns:
all indexed document ids

indexTermWithVariant

protected abstract void indexTermWithVariant(java.util.Map<java.lang.String,java.util.Set<java.lang.String>> map)
Given a map containing [term canonical form - term variant forms], index the mapping

Parameters:
map -

retrieveVariantsOfTermCanonical

public abstract java.util.Set<java.lang.String> retrieveVariantsOfTermCanonical(java.lang.String term)
Given a term canonical form, retrieve its variant forms found in the corpus

Parameters:
term -
Returns:

indexTermCanonicalInDoc

protected abstract void indexTermCanonicalInDoc(java.lang.String t,
                                                Document d)
Given a candidate term's canonical form t found in document d, index the binary relation "t found_in d"

Parameters:
t -
d -

indexTermCanonicalInDoc

protected abstract void indexTermCanonicalInDoc(int t,
                                                int d)
Given a candidate term's canonical form id t found in document with id d, index the binary relation "t found_in d"

Parameters:
t -
d -

retrieveDocIdsContainingTermCanonical

public abstract java.util.Set<java.lang.Integer> retrieveDocIdsContainingTermCanonical(java.lang.String t)
Parameters:
t - the candidate term's canonical form in question
Returns:
the ids of the documents that contain the candidate term t

retrieveDocIdsContainingTermCanonical

public abstract java.util.Set<java.lang.Integer> retrieveDocIdsContainingTermCanonical(int id)
Parameters:
id - the id of the candidate term's canonical form in question
Returns:
the ids of the documents that contain the candidate term with the given id

retrieveDocsContainingTermCanonical

public abstract java.util.Set<Document> retrieveDocsContainingTermCanonical(java.lang.String t)
Parameters:
t - the candidate term's canonical form in question
Returns:
the documents which contain the candidate term t

retrieveDocsContainingTermCanonical

public abstract java.util.Set<Document> retrieveDocsContainingTermCanonical(int t)
Parameters:
t - the candidate term's canonical form id in question
Returns:
the documents which contain the candidate term t

sizeTermInDocs

public abstract int sizeTermInDocs(java.lang.String t)
Parameters:
t - the candidate term's canonical form
Returns:
number of documents that contain the candidate term (any variants)

sizeTermInDocs

public abstract int sizeTermInDocs(int t)
Parameters:
t - the id of candidate term's canonical form
Returns:
number of documents that contain the candidate term (any variants)

indexDocWithTermsCanonical

protected abstract void indexDocWithTermsCanonical(Document d,
                                                   java.util.Set<java.lang.String> terms)
Given a document d which contains a set of terms (canonical form), index the binary relation "document contains term canonical"

Parameters:
d -
terms - canonical forms of candidate terms found in document d

indexDocWithTermsCanonical

protected abstract void indexDocWithTermsCanonical(int d,
                                                   java.util.Set<java.lang.Integer> terms)
Given a document with id d which contains a set of terms (canonical form), index the binary relation "document contains term canonical"

Parameters:
d - id of document
terms - ids of the canonical forms of candidate terms found in document d

retrieveTermCanonicalIdsInDoc

public abstract java.util.Set<java.lang.Integer> retrieveTermCanonicalIdsInDoc(Document d)
Parameters:
d -
Returns:
the ids of canonical forms of terms found in the document d

retrieveTermCanonicalIdsInDoc

public abstract java.util.Set<java.lang.Integer> retrieveTermCanonicalIdsInDoc(int d)
Parameters:
d -
Returns:
the ids of canonical forms of terms found in the document d

retrieveTermsCanonicalInDoc

public abstract java.util.Set<java.lang.String> retrieveTermsCanonicalInDoc(Document d)
Parameters:
d -
Returns:
the canonical form of terms found in the document d

retrieveTermCanonicalInDoc

public abstract java.util.Set<java.lang.String> retrieveTermCanonicalInDoc(int d)
Parameters:
d -
Returns:
the canonical form of terms found in the document d

sizeDocHasTerms

public abstract int sizeDocHasTerms(Document d)
Parameters:
d -
Returns:
number of unique candidate terms (canonical form) found in document d

sizeDocHasTerms

public abstract int sizeDocHasTerms(int d)
Parameters:
d -
Returns:
number of unique candidate terms (canonical form) found in document d