%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %
% %
% A Bibliography on Automatic Text Categorization %
% %
% compiled and maintained by %
% %
% Fabrizio Sebastiani %
% Istituto di Elaborazione dell'Informazione %
% Consiglio Nazionale delle Ricerche %
% Via Giuseppe Moruzzi, 1 - 56124 Pisa, Italy %
% http://faure.iei.pi.cnr.it/~fabrizio/ %
% %
% %
% This is a bibliography, in BibTeX format, on automatic text %
% categorization (ATC), defined as the activity of automatically %
% building, by means of machine learning techniques, automatic text %
% classifiers, i.e. systems capable of assigning to a text %
% document one or more thematic categories from a predefined set. %
% %
% This bibliography resides at %
% http://faure.iei.pi.cnr.it/~fabrizio/ATCbibliography.bib %
% Everyone is welcome to download it as a whole and distribute it, %
% provided that it is distributed untouched. %
% %
% Everyone is also welcome to let me know either additional %
% references or corrections and additions (e.g. URLs, where %
% they are not already present) to the existing ones. %
% In general, only references specific to ATC are considered %
% pertinent to this bibliography; in particular, references that %
% #are# considered pertinent are: %
% %
% * publications that discuss novel ATC methods, novel %
% experimentation of previously known methods, or resources for %
% ATC experimentation; %
% %
% * publications that discuss applications of ATC (e.g. %
% automated indexing for Boolean IR systems, filtering, etc.). %
% %
% References that are #not# considered pertinent are: %
% %
% * publications that discuss techniques in principle useful for %
% ATC (e.g. machine learning techniques, information retrieval %
% techniques) but do not explicitly discuss their application %
% to ATC; %
% %
% * publications that discuss related topics sometimes confused with %
% ATC; these include, in particular, text clustering (i.e. text %
% classification by unsupervised learning) and text indexing; %
% %
% * technical reports and workshop papers. Only papers that have %
% been the object of formal publication (i.e. conferences and %
% journals) are to be included in the bibliography, so as to avoid %
% its explosion and the inclusion of material bound to obsolescence. %
% %
% Concerning URLs from which to download on-line copies of the %
% papers, where possible I have included URLs with unrestricted %
% access (e.g. home pages of authors). When such URLs were not %
% available, sometimes a URL with restricted access (e.g. the %
% ACM Digital Library or the IEEE Computer Society Digital %
% Library, which are accessible to subscribers only) is indicated. %
% When this is the case, if you know of a URL with unrestricted access %
% from which the paper is also available, please let me know and I %
% will substitute the link. %
% %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% NOTE(review): the first author's surname is Chai (Kian Ming Adam Chai), so
% the name is given in "Last, First" form. The citation key Adam02 is kept
% unchanged so that existing citations still resolve. Verify the full name
% against the ACM Digital Library record.
@inproceedings{Adam02,
  author    = {Chai, Kian Ming Adam and Ng, Hwee T. and Chieu, Hai L.},
  title     = {Bayesian Online Classifiers for Text Classification and Filtering},
  booktitle = {Proceedings of SIGIR-02, 25th ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng
    and Kalervo J{\"{a}}rvelin},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Tampere, {FI}},
  year      = {2002},
  pages     = {97--104},
  url       = {http://doi.acm.org/10.1145/564376.564395},
  abstract  = {This paper explores the use of Bayesian online classifiers to
    classify text documents. Empirical results indicate that these
    classifiers are comparable with the best text classification
    systems. Furthermore, the online approach offers the advantage of
    continuous learning in the batch-adaptive text filtering task.},
}
% NOTE(review): the venue was previously a copy of the EDBT-00 (Konstanz)
% data from the Agrawal00 entry, which contradicts year 1999 and the ACM DOI
% prefix 10.1145/312129. Venue corrected to KDD-99 (San Diego); please
% confirm against the ACM Digital Library record.
@inproceedings{Aggarwal99,
  author    = {Charu C. Aggarwal and Stephen C. Gates and Philip S. Yu},
  title     = {On the merits of building categorization systems by supervised
    clustering},
  booktitle = {Proceedings of KDD-99, 5th ACM International Conference on
    Knowledge Discovery and Data Mining},
  publisher = {{ACM} Press, New York, {US}},
  address   = {San Diego, {US}},
  year      = {1999},
  pages     = {352--356},
  url       = {http://doi.acm.org/10.1145/312129.312279},
  abstract  = {This paper investigates the use of supervised clustering in order
    to create sets of categories for classification of documents. We
    use information from a pre-existing taxonomy in order to
    supervise the creation of a set of related clusters, though with
    some freedom in defining and creating the classes. We show that
    the advantage of using supervised clustering is that it is
    possible to have some control over the range of subjects that one
    would like the categorization system to address, but with a
    precise mathematical definition of each category. We then
    categorize documents using this a priori knowledge of the
    definition of each category. We also discuss a new technique to
    help the classifier distinguish better among closely related
    clusters. Finally, we show empirically that this categorization
    system utilizing a machine-derived taxonomy performs as well as a
    manual categorization process, but at a far lower cost.},
}
@inproceedings{Agrawal00,
  author    = {Rakesh Agrawal and Roberto J. Bayardo and Ramakrishnan Srikant},
  title     = {{\sc Athena}: Mining-based Interactive Management of Text
    Databases},
  booktitle = {Proceedings of EDBT-00, 7th International Conference on Extending
    Database Technology},
  editor    = {Carlo Zaniolo and Peter C. Lockemann and Marc H. Scholl and
    Torsten Grust},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Konstanz, {DE}},
  year      = {2000},
  pages     = {365--379},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
    number 1777},
  url       = {http://www.almaden.ibm.com/cs/people/ragrawal/papers/athena.ps},
  abstract  = {We describe Athena: a system for creating, exploiting, and
    maintaining a hierarchical arrangement of textual documents
    through interactive mining-based operations. Requirements of any
    such system include speed and minimal end-user effort. Athena
    satisfies these requirements through linear-time classification
    and clustering engines which are applied interactively to speed
    the development of accurate models. Naive Bayes classifiers are
    recognized to be among the best for classifying text. We show
    that our specialization of the Naive Bayes classifier is
    considerably more accurate (7 to 29\% absolute increase in
    accuracy) than a standard implementation. Our enhancements
    include using Lidstone's law of succession instead of Laplace's
    law, under-weighting long documents, and over-weighting author
    and subject. We also present a new interactive clustering
    algorithm, C-Evolve, for topic discovery. C-Evolve first finds
    highly accurate cluster digests (partial clusters), gets user
    feedback to merge and correct these digests, and then uses the
    classification algorithm to complete the partitioning of the
    data. By allowing this interactivity in the clustering process,
    C-Evolve achieves considerably higher clustering accuracy (10 to
    20\% absolute increase in our experiments) than the popular
    K-Means and agglomerative clustering methods.},
}
@inproceedings{Agrawal01,
  author    = {Rakesh Agrawal and Ramakrishnan Srikant},
  title     = {On integrating catalogs},
  booktitle = {Proceedings of WWW-01, 10th International Conference on the World
    Wide Web},
  editor    = {},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Hong Kong, {CN}},
  year      = {2001},
  pages     = {603--612},
  url       = {http://doi.acm.org/10.1145/371920.372163},
  abstract  = {We address the problem of integrating documents from different
    sources into a master catalog. This problem is pervasive in web
    marketplaces and portals. Current technology for automating this
    process consists of building a classifier that uses the
    categorization of documents in the master catalog to construct a
    model for predicting the category of unknown documents. Our key
    insight is that many of the data sources have their own
    categorization, and classification accuracy can be improved by
    factoring in the implicit information in these source
    categorizations. We show how a Naive Bayes classification can be
    enhanced to incorporate the similarity information present in
    source catalogs. Our analysis and empirical evaluation show
    substantial improvement in the accuracy of catalog integration.},
}
@inproceedings{Aizawa00,
  author    = {Akiko Aizawa},
  title     = {The feature quantity: an information-theoretic perspective of
    tfidf-like measures},
  booktitle = {Proceedings of SIGIR-00, 23rd ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Athens, {GR}},
  year      = {2000},
  pages     = {104--111},
  url       = {http://doi.acm.org/10.1145/345508.345556},
  abstract  = {The feature quantity, a quantitative representation of
    specificity introduced in this paper, is based on an information
    theoretic perspective of co-occurrence events between terms and
    documents. Mathematically, the feature quantity is defined as a
    product of probability and information, and maintains a good
    correspondence with the tfidf-like measures popularly used in
    today's IR systems. In this paper, we present a formal
    description of the feature quantity, as well as some illustrative
    examples of applying such a quantity to different types of
    information retrieval tasks: representative term selection and
    text categorization.},
}
@inproceedings{Aizawa01,
  author    = {Akiko Aizawa},
  title     = {Linguistic Techniques to Improve the Performance of Automatic
    Text Categorization},
  booktitle = {Proceedings of NLPRS-01, 6th Natural Language Processing Pacific
    Rim Symposium},
  editor    = {},
  publisher = {},
  address   = {Tokyo, {JP}},
  year      = {2001},
  pages     = {307--314},
  url       = {http://www.afnlp.org/nlprs2001/pdf/0079-01.pdf},
  abstract  = {This paper presents a method for incorporating natural language
    processing into existing text categorization procedures. Three
    aspects are considered in the investigation: (i) a method for
    weighting terms based on the concept of a probability weighted
    amount of information, (ii) estimation of term occurrence
    probabilities using a probabilistic language model, and (iii)
    automatic extraction of terms based on POS tags automatically
    generated by a morphological analyzer. The effects of these
    considerations are examined in the experiments using
    Reuters-21578 and NTCIR-J1 standard test collections.},
}
% NOTE(review): the key says 02 but the venue (ICPhS-03) and year are 2003;
% the key is kept unchanged so that existing citations still resolve.
% The empty fields are placeholders for data still to be supplied.
@inproceedings{Alias02,
  author    = {Francesc Al{\'{\i}}as and Ignasi Iriondo and Pere Barnola},
  title     = {Multi-domain text classification for unit selection
    text-to-speech synthesis},
  booktitle = {Proceedings of ICPhS-03, 15th International Congress on Phonetic
    Sciences},
  editor    = {},
  publisher = {},
  address   = {Barcelona, {ES}},
  year      = {2003},
  pages     = {},
  url       = {},
  abstract  = {},
}
@inproceedings{AlKofahi01,
  author    = {Khalid Al-Kofahi and Alex Tyrrell and Arun Vachher and Tim
    Travers and Peter Jackson},
  title     = {Combining Multiple Classifiers for Text Categorization},
  booktitle = {Proceedings of CIKM-01, 10th ACM International Conference on
    Information and Knowledge Management},
  editor    = {Henrique Paques and Ling Liu and David Grossman},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Atlanta, {US}},
  year      = {2001},
  pages     = {97--104},
  url       = {http://doi.acm.org/10.1145/502585.502603},
  abstract  = {A major problem facing online information services is how to
    index and supplement large document collections with respect to a
    rich set of categories. We focus upon the routing of case law
    summaries to various secondary law volumes in which they should
    be cited. Given the large number ($>$ 13,000) of closely related
    categories, this is a challenging task that is unlikely to
    succumb to a single algorithmic solution. Our fully implemented
    and recently deployed system shows that a superior classification
    engine for this task can be constructed from a combination of
    classifiers. The multi-classifier approach helps us leverage all
    the relevant textual features and meta data, and appears to
    generalize to related classification tasks.},
}
@inproceedings{Amati96,
  author    = {Gianni Amati and Daniela D'Aloisi and Vittorio Giannini and
    Flavio Ubaldini},
  title     = {An Integrated System for Filtering News and Managing Distributed
    Data},
  booktitle = {Proceedings of PAKM-96, 1st International Conference on Practical
    Aspects of Knowledge Management},
  editor    = {},
  publisher = {},
  address   = {Basel, {CH}},
  year      = {1996},
  pages     = {},
  note      = {An extended version appears as~\cite{Amati97b}},
  url       = {http://airone.fub.it:8080/projects/pakm96.ps},
  abstract  = {With the development and diffusion of the Internet worldwide
    connection, a large amount of information can be delivered to the
    users. To avoid their being overflowed by the incoming data,
    methods of information filtering are required. Thus, there is the
    problem of determining what information is relevant to the user
    and how this decision can be taken by a supporting system.
    Parametric and qualitative descriptors of user's interest must be
    generated. This paper presents two approaches. The first concerns
    an information filtering system based on an adaptation of the
    generalized probabilistic model of information retrieval. The
    user profile is a vector of weighted terms which are learned from
    the relevance assessment values given by the user on the training
    set. Positive terms are considered relevant to the informative
    need of the user, negative ones irrelevant. The relevance values
    are interpreted as subjective probabilities and hence are mapped
    into the real interval [0; 1]. ProFile is a filtering system for
    the netnews which uses this model with a scale of 11 predefined
    values of relevance. ProFile allows the user to update on-line
    his profile and to check the discrepancy between his assessment
    and the prediction of relevance of the system. The second
    concerns the InfoAgent, a system for supporting users in
    retrieving data in distributed and heterogeneous archives and
    repositories. The architecture is based on the metaphor of the
    software agents and incorporates innovative hints from other
    fields: distributed architectures, relevance feedback and active
    interfaces. The system has a cooperative and supportive role: it
    understands the user's needs and learns from his behavior. Its
    aim is to disengage the user from learning complex tools and from
    performing tedious and repetitive actions.},
}
@inproceedings{Amati97,
  author    = {Gianni Amati and Fabio Crestani and Flavio Ubaldini},
  title     = {A learning system for selective dissemination of information},
  booktitle = {Proceedings of IJCAI-97, 15th International Joint Conference on
    Artificial Intelligence},
  editor    = {Martha E. Pollack},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Nagoya, {JP}},
  year      = {1997},
  pages     = {764--769},
  url       = {http://www.cs.strath.ac.uk/~fabioc/papers/97-ijcai.pdf},
  abstract  = {New methods and new systems are needed to filter or to
    selectively distribute the increasing volume of electronic
    information being produced nowadays. An effective information
    filtering system is one that provides the exact information that
    fulfills a user's interest with the minimum effort by the user to
    describe it. Such a system will have to be adaptive to the user
    changing interest. In this paper we present a learning system for
    information filtering and selective information dissemination.
    The learning algorithm is described and the effectiveness of the
    system is evaluated in a true information filtering style.},
}
% NOTE(review): the ordinal was "1st"; RIAO-97 was the fifth RIAO conference
% (1985, 1988, 1991, 1994, 1997). Confirm against the proceedings front matter.
@inproceedings{Amati97a,
  author    = {Gianni Amati and Fabio Crestani and Flavio Ubaldini and Stefano
    De Nardis},
  title     = {Probabilistic Learning for Information Filtering},
  booktitle = {Proceedings of RIAO-97, 5th International Conference ``Recherche
    d'Information Assist{\'e}e par Ordinateur''},
  editor    = {Luc Devroye and Claude Chrisment},
  address   = {Montreal, {CA}},
  year      = {1997},
  pages     = {513--530},
  note      = {An extended version appears as~\cite{Amati99}},
  url       = {http://www.cs.strath.ac.uk/~fabioc/papers/97-riao.pdf},
  abstract  = {In this paper we describe and evaluate a learning model for
    information filtering which is an adaptation of the generalised
    probabilistic model of Information Retrieval. The model is based
    on the concept of ``uncertainty sampling'', a technique that
    allows for relevance feedback both on relevant and non relevant
    documents. The proposed learning model is the core of a prototype
    information filtering system called ProFile.},
}
@article{Amati97b,
  author   = {Gianni Amati and Daniela D'Aloisi and Vittorio Giannini and
    Flavio Ubaldini},
  title    = {A Framework for Filtering News and Managing Distributed Data},
  journal  = {Journal of Universal Computer Science},
  year     = {1997},
  volume   = {3},
  number   = {8},
  pages    = {1007--1021},
  url      = {http://www.jucs.org/jucs_3_8/a_framework_for_filtering},
  abstract = {With the development and diffusion of the Internet worldwide
    connection, a large amount of information is available to the
    users. Methods of information filtering and fetching are then
    required. This paper presents two approaches. The first concerns
    the information filtering system ProFile based on an adaptation
    of the generalized probabilistic model of information retrieval.
    ProFile filters the netnews and uses a scale of 11 predefined
    values of relevance. ProFile allows the user to update on-line
    the profile and to check the discrepancy between the assessment
    and the prediction of relevance of the system. The second
    concerns ABIS, an intelligent agent for supporting users in
    filtering data from distributed and heterogeneous archives and
    repositories. ABIS minimizes user's effort in selecting the huge
    amount of available documents. The filtering engine memorizes
    both user preferences and past situations. ABIS compares
    documents with the past situations and finds the similarity
    scores on the basis of a memory-based reasoning approach.},
}
@article{Amati99,
  author   = {Gianni Amati and Fabio Crestani},
  title    = {Probabilistic learning for selective dissemination of information},
  journal  = {Information Processing and Management},
  year     = {1999},
  volume   = {35},
  number   = {5},
  pages    = {633--654},
  url      = {http://www.cs.strath.ac.uk/~fabioc/papers/99-ipem.pdf},
  abstract = {New methods and new systems are needed to filter or to
    selectively distribute the increasing volume of electronic
    information being produced nowadays. An effective information
    filtering system is one that provides the exact information that
    fulfills user's interests with the minimum effort by the user to
    describe it. Such a system will have to be adaptive to the user
    changing interest. In this paper we describe and evaluate a
    learning model for information filtering which is an adaptation
    of the generalized probabilistic model of Information Retrieval.
    The model is based on the concept of `uncertainty sampling', a
    technique that allows for relevance feedback both on relevant and
    nonrelevant documents. The proposed learning model is the core of
    a prototype information filtering system called ProFile.},
}
@inproceedings{Androutsopoulos00,
  author    = {Ion Androutsopoulos and John Koutsias and Konstandinos V.
    Chandrinos and Constantine D. Spyropoulos},
  title     = {An experimental comparison of naive {B}ayesian and keyword-based
    anti-spam filtering with personal e-mail messages},
  booktitle = {Proceedings of SIGIR-00, 23rd ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Athens, {GR}},
  year      = {2000},
  pages     = {160--167},
  url       = {http://doi.acm.org/10.1145/345508.345569},
  abstract  = {The growing problem of unsolicited bulk e-mail, also known as
    ``spam'', has generated a need for reliable anti-spam e-mail
    filters. Filters of this type have so far been based mostly on
    manually constructed keyword patterns. An alternative approach
    has recently been proposed, whereby a Naive Bayesian classifier
    is trained automatically to detect spam messages. We test this
    approach on a large collection of personal e-mail messages, which
    we make publicly available in ``encrypted'' form contributing
    towards standard benchmarks. We introduce appropriate
    cost-sensitive measures, investigating at the same time the
    effect of attribute-set size, training-corpus size,
    lemmatization, and stop lists, issues that have not been explored
    in previous experiments. Finally, the Naive Bayesian filter is
    compared, in terms of performance, to a filter that uses keyword
    patterns, and which is part of a widely used e-mail reader.},
}
@article{Appiani01,
  author   = {Enrico Appiani and Francesca Cesarini and Annamaria Colla and
    Massimiliano Diligenti and Marco Gori and Simone Marinai and
    Giovanni Soda},
  title    = {Automatic document classification and indexing in high-volume
    applications},
  journal  = {International Journal on Document Analysis and Recognition},
  year     = {2001},
  volume   = {4},
  number   = {2},
  pages    = {69--83},
  url      = {http://link.springer-ny.com/link/service/journals/10032/papers/1004002/10040069.pdf},
  abstract = {In this paper a system for analysis and automatic indexing of
    imaged documents for high-volume applications is described. This
    system, named STRETCH (STorage and RETrieval by Content of imaged
    documents), is based on an Archiving and Retrieval Engine, which
    overcomes the bottleneck of document profiling bypassing some
    limitations of existing pre-defined indexing schemes. The engine
    exploits a structured document representation and can activate
    appropriate methods to characterise and automatically index
    heterogeneous documents with variable layout. The originality of
    STRETCH lies principally in the possibility for unskilled users
    to define the indexes relevant to the document domains of their
    interest by simply presenting visual examples and applying
    reliable automatic information extraction methods (document
    classification, flexible reading strategies) to index the
    documents automatically, thus creating archives as desired.
    STRETCH offers ease of use and application programming and the
    ability to dynamically adapt to new types of documents. The
    system has been tested in two applications in particular, one
    concerning passive invoices and the other bank documents. In
    these applications, several classes of documents are involved.
    The indexing strategy first automatically classifies the
    document, thus avoiding pre-sorting, then locates and reads the
    information pertaining to the specific document class.
    Experimental results are encouraging overall; in particular,
    document classification results fulfill the requirements of
    high-volume application. Integration into production lines is
    under execution.},
}
@article{Apte94,
  author   = {Apt{\'e}, Chidanand and Damerau, Fred J. and Weiss, Sholom M.},
  title    = {Automated learning of decision rules for text categorization},
  journal  = {{ACM} Transactions on Information Systems},
  year     = {1994},
  volume   = {12},
  number   = {3},
  pages    = {233--251},
  url      = {http://www.acm.org/pubs/articles/journals/tois/1994-12-3/p233-apte/p233-apte.pdf},
  abstract = {We describe the results of extensive experiments using optimized
    rule-based induction methods on large document collections. The
    goal of these methods is to discover automatically classification
    patterns that can be used for general document categorization or
    personalized filtering of free text. Previous reports indicate
    that human-engineered rule-based systems, requiring many
    man-years of developmental efforts, have been successfully built
    to ``read'' documents and assign topics to them. We show that
    machine-generated decision rules appear comparable to human
    performance, while using the identical rule-based representation.
    In comparison with other machine-learning techniques, results on
    a key benchmark from the Reuters collection show a large gain in
    performance, from a previously reported 67\% recall/precision
    breakeven point to 80.5\%. In the context of a very
    high-dimensional feature space, several methodological
    alternatives are examined, including universal versus local
    dictionaries, and binary versus frequency related features.},
}
@inproceedings{Apte94a,
  author    = {Apt{\'e}, Chidanand and Damerau, Fred J. and Weiss, Sholom M.},
  title     = {Towards Language-Independent Automated Learning of Text
    Categorization Models},
  booktitle = {Proceedings of SIGIR-94, 17th ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {W. Bruce Croft and Cornelis J. Van Rijsbergen},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Dublin, {IE}},
  year      = {1994},
  pages     = {23--30},
  note      = {An extended version appears as~\cite{Apte94}},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/188490/p23-apte/p23-apte.pdf},
  abstract  = {We describe the results of extensive machine learning experiments
    on large collections of Reuters' English and German newswires.
    The goal of these experiments was to automatically discover
    classification patterns that can be used for assignment of topics
    to the individual newswires. Our results with the English
    newswire collection show a very large gain in performance as
    compared to published benchmarks, while our initial results with
    the German newswires appear very promising. We present our
    methodology, which seems to be insensitive to the language of the
    document collections, and discuss issues related to the
    differences in results that we have obtained for the two
    collections.},
}
@article{Attardi98,
  author   = {Attardi, Giuseppe and Di Marco, Sergio and Salvi, Davide},
  title    = {Categorization by context},
  journal  = {Journal of Universal Computer Science},
  year     = {1998},
  volume   = {4},
  number   = {9},
  pages    = {719--736},
  url      = {http://www.jucs.org/jucs_4_9/categorisation_by_context},
  abstract = {Assistance in retrieving of documents on the World Wide Web is
    provided either by search engines, through keyword based queries,
    or by catalogues, which organise documents into hierarchical
    collections. Maintaining catalogues manually is becoming
    increasingly difficult due to the sheer amount of material on the
    Web, and therefore it will be soon necessary to resort to
    techniques for automatic classification of documents.
    Classification is traditionally performed by extracting
    information for indexing a document from the document itself. The
    paper describes the technique of categorisation by context, which
    exploits the context perceivable from the structure of HTML
    documents to extract useful information for classifying the
    documents they refer to. We present the results of experiments
    with a preliminary implementation of the technique.},
}
@inproceedings{Attardi99,
  author    = {Giuseppe Attardi and Antonio Gull{\'{\i}} and Fabrizio Sebastiani},
  title     = {Automatic {W}eb Page Categorization by Link and Context Analysis},
  booktitle = {Proceedings of THAI-99, 1st European Symposium on Telematics,
    Hypermedia and Artificial Intelligence},
  editor    = {Chris Hutchison and Gaetano Lanzarone},
  address   = {Varese, {IT}},
  year      = {1999},
  pages     = {105--119},
  url       = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/THAI99.pdf},
  abstract  = {Assistance in retrieving documents on the World Wide Web is
    provided either by search engines, through keyword-based queries,
    or by catalogues, which organize documents into hierarchical
    collections. Maintaining catalogues manually is becoming
    increasingly difficult, due to the sheer amount of material on
    the Web; it is thus becoming necessary to resort to techniques
    for the automatic classification of documents. Automatic
    classification is traditionally performed by extracting the
    information for representing a document (``indexing'') from the
    document itself. The paper describes the novel technique of
    categorization by context, which instead extracts useful
    information for classifying a document from the context where a
    URL referring to it appears. We present the results of
    experimenting with Theseus, a classifier that exploits this
    technique.},
}
@inproceedings{Avancini03,
  author    = {Henri Avancini and Alberto Lavelli and Bernardo Magnini and
    Fabrizio Sebastiani and Roberto Zanoli},
  title     = {Expanding Domain-Specific Lexicons by Term Categorization},
  booktitle = {Proceedings of SAC-03, 18th ACM Symposium on Applied Computing},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Melbourne, {US}},
  year      = {2003},
  pages     = {793--797},
  url       = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/SAC03c.pdf},
  abstract  = {We discuss an approach to the automatic expansion of
    domain-specific lexicons by means of \emph{term categorization},
    a novel task employing techniques from information retrieval (IR)
    and machine learning (ML). Specifically, we view the expansion of
    such lexicons as a process of learning previously unknown
    associations between terms and \emph{domains}. The process
    generates, for each $c_{i}$ in a set $C=\{c_{1},\ldots,c_{m}\}$
    of domains, a lexicon $L^{i}_{1}$, bootstrapping from an initial
    lexicon $L^{i}_{0}$ and a set of documents $\theta$ given as
    input. The method is inspired by \emph{text categorization} (TC),
    the discipline concerned with labelling natural language texts
    with labels from a predefined set of domains, or categories.
    However, while TC deals with documents represented as vectors in
    a space of terms, we formulate the task of term categorization as
    one in which terms are (dually) represented as vectors in a space
    of documents, and in which terms (instead of documents) are
    labelled with domains.},
}
@inproceedings{Baker98,
  author    = {L. Douglas Baker and Andrew K. McCallum},
  title     = {Distributional clustering of words for text classification},
  booktitle = {Proceedings of SIGIR-98, 21st ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {W. Bruce Croft and Alistair Moffat and Cornelis J. Van Rijsbergen
    and Ross Wilkinson and Justin Zobel},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Melbourne, {AU}},
  year      = {1998},
  pages     = {96--103},
  url       = {http://www.cs.cmu.edu/~mccallum/papers/clustering-sigir98.ps.gz},
  abstract  = {We describe the application of distributional clustering to
    document classification. This approach clusters words into groups
    based on the distribution of class labels associated with each
    word. Thus, unlike some other unsupervised
    dimensionality-reduction techniques, such as latent semantic
    indexing, we are able to compress the feature space much more
    aggressively, while still maintaining high document
    classification accuracy. Experimental results obtained on three
    real-world data sets show that we can reduce the feature
    dimensionality by three orders of magnitude and lose only 2\%
    accuracy, significantly better than latent semantic indexing,
    class-based clustering, feature selection by mutual information,
    or Markov-blanket-based feature selection. We also show that less
    aggressive clustering sometimes results in improved
    classification accuracy over classification without clustering.},
}
@inproceedings{Bao01,
  author    = {Yongguang Bao and Satoshi Aoyama and Xiaoyong Du and Kazutaka
               Yamada and Naohiro Ishii},
  title     = {A Rough Set-Based Hybrid Method to Text Categorization},
  booktitle = {Proceedings of WISE-01, 2nd International Conference on Web
               Information Systems Engineering},
  editor    = {M. Tamer {\"{O}}zsu and Hans-J{\"{o}}rg Schek and Katsumi Tanaka
               and Yanchun Zhang and Yahiko Kambayashi},
  publisher = {{IEEE} Computer Society Press, Los Alamitos, {US}},
  address   = {Kyoto, {JP}},
  year      = {2001},
  pages     = {254--261},
  url       = {http://dlib.computer.org/conferen/wise/1393/pdf/volume1/13930254.pdf},
  abstract  = {In this paper we present a hybrid text categorization method
               based on Rough Sets theory. A central problem in good text
               Classification for information filtering and retrieval (IF/IR) is
               the high dimensionality of the data. It may contain many
               unnecessary and irrelevant features. To cope with this problem,
               we propose a hybrid technique using Latent Semantic Indexing
               (LSI) and Rough Sets theory (RS) to alleviate this situation.
               Given corpora of documents and a training set of examples of
               classified documents, the technique locates a minimal set of
               co-ordinate keywords to distinguish between classes of documents,
               reducing the dimensionality of the keyword vectors. This
               simplifies the creation of knowledge-based IF/IR systems, speeds
               up their operation, and allows easy editing of the rule bases
               employed. Besides, we generate several knowledge base instead of
               one knowledge base for the classification of new object, hoping
               that the combination of answers of the multiple knowledge bases
               result in better performance. Multiple knowledge bases can be
               formulated precisely and in a unified way within the framework of
               RS. This paper describes the proposed technique, discusses the
               integration of a keyword acquisition algorithm, Latent Semantic
               Indexing (LSI) with Rough Set-based rule generate algorithm, and
               provides experimental results. The test results show the hybrid
               method is better than the previous rough set-based approach.},
}
@inproceedings{Basili00,
  author    = {Roberto Basili and Alessandro Moschitti and Maria T. Pazienza},
  title     = {Language-Sensitive Text Classification},
  booktitle = {Proceedings of RIAO-00, 6th International Conference ``Recherche
               d'Information Assist{\'e}e par Ordinateur''},
  editor    = {},
  address   = {Paris, {FR}},
  year      = {2000},
  pages     = {331--343},
  url       = {},
  abstract  = {It is a traditional belief that in order to scale-up to more
               effective retrieval and access methods modern Information
               Retrieval has to consider more the text content. The modalities
               and techniques to fit this objectives are still under discussion.
               More empirical evidence is required to determine the suitable
               linguistic levels for modeling each IR subtask (e.g. information
               zoning, parsing, feature selection for indexing,...) and the
               corresponding use of this information. In this paper an original
               classification model sensitive to document syntactic information
               and characterized by a novel inference method is described.
               Extensive experimental evidence has been derived on real test
               data and also from well-established academic test sets. The
               results show that a significant improvement can be derived using
               the proposed inference model. Also the role of linguistic
               preprocessing seems to provide positive effects on the
               performance. POS tagging and recognition of Proper Nouns received
               a specific experimental attention and provided significant
               effects on measured accuracy.},
}
@inproceedings{Basili01,
  author    = {Roberto Basili and Alessandro Moschitti and Maria T. Pazienza},
  title     = {{NLP}-driven {IR}: Evaluating Performances over a Text
               Classification task},
  booktitle = {Proceedings of IJCAI-01, 17th International Joint Conference on
               Artificial Intelligence},
  editor    = {Bernhard Nebel},
  address   = {Seattle, {US}},
  year      = {2001},
  pages     = {1286--1291},
  url       = {},
  abstract  = {Although several attempts have been made to introduce Natural
               Language Processing (NLP) techniques in Information Retrieval,
               most ones failed to prove their effectiveness in increasing
               performances. In this paper Text Classification (TC) has been
               taken as the IR task and the effect of linguistic capabilities of
               the underlying system have been studied. A novel model for TC,
               extending a well know statistical model (i.e. Rocchio's formula
               [Ittner et al., 1995]) and applied to linguistic features has
               been defined and experimented. The proposed model represents an
               effective feature selection methodology. All the experiments
               result in a significant improvement with respect to other purely
               statistical methods (e.g. [Yang, 1999]), thus stressing the
               relevance of the available linguistic information. Moreover, the
               derived classifier reachs the performance (about 85\%) of the
               best known models (i.e. Support Vector Machines (SVM) and
               k-Nearest Neighbour (KNN)) characterized by an higher
               computational complexity for training and processing.},
}
@inproceedings{Basili01a,
  author    = {Roberto Basili and Alessandro Moschitti and Maria T. Pazienza},
  title     = {An hybrid approach to optimize feature selection process in text
               classification},
  booktitle = {Proceedings of AI*IA-01, 7th Congress of the Italian Association
               for Artificial Intelligence},
  editor    = {Floriana Esposito},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Bari, {IT}},
  year      = {2001},
  pages     = {320--325},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 2175},
  url       = {http://link.springer.de/link/service/series/0558/papers/2175/21750320.pdf},
  abstract  = {Feature selection and weighting are the primary activity of every
               learning algorithm for text classification. Traditionally these
               tasks are carried out individually in two distinct phases: the
               first is the global feature selection during a corpus
               pre-processing and the second is the application of the feature
               weighting model. This means that two (or several) different
               techniques are used to optimize the performances even if a single
               algorithm may have more chances to operate the right choices.
               When the complete feature set is available, the classifier
               learning algorithm can better relate to the suitable
               representation level the different complex features like
               linguistic ones (e.g. syntactic categories associated to words in
               the training material or terminological expressions). In [3] it
               has been suggested that classifiers based on generalized Rocchio
               formula can be used to weight features in category profiles in
               order to exploit the selectivity of linguistic information
               techniques in text classification. In this paper, a systematic
               study aimed to understand the role of Rocchio formula in
               selection and weighting of linguistic features will be described.},
}
@inproceedings{Basili01b,
  author    = {Roberto Basili and Alessandro Moschitti},
  title     = {A robust model for intelligent text classification},
  booktitle = {Proceedings of ICTAI-01, 13th IEEE International Conference on
               Tools with Artificial Intelligence},
  editor    = {},
  publisher = {{IEEE} Computer Society Press, Los Alamitos, {US}},
  address   = {Dallas, {US}},
  year      = {2001},
  pages     = {265--272},
  url       = {http://dlib.computer.org/conferen/ictai/1417/pdf/14170265.pdf},
  abstract  = {Methods for taking into account linguistic content into text
               retrieval are receiving a growing attention [16],[14]. Text
               categorization is an interesting area for evaluating and
               quantifying the impact of linguistic information. Works in text
               retrieval through Internet suggest that embedding linguistic
               information at a suitable level within traditional quantitative
               approaches (e.g. sense distinctions for query expansion as in
               [14]) is the crucial issue able to bring the experimental stage
               to operational results. This kind of representational problem is
               also studied in this paper where traditional methods for
               statistical text categorization are augmented via a systematic
               use of linguistic information. Again, as in [14], the addition of
               NLP capabilities also suggested a different application of
               existing methods in revised forms. This paper presents an
               extension of the Rocchio formula [11] as a feature weighting and
               selection model used as a basis for multilingual Information
               Extraction. It allows an effective exploitation of the available
               linguistic information that better emphasizes this latter with
               significant both data compression and accuracy. The results is an
               original statistical classifier fed with linguistic (i.e. more
               complex) features and characterized by the novel feature
               selection and weighting model. It outperforms existing systems by
               keeping most of their interesting properties (i.e. easy
               implementation, low complexity and high scalability). Extensive
               tests of the model suggest its application as a viable and robust
               tool for large scale text classification and filtering, as well
               as a basic module for more complex scenarios.},
}
@article{Bayer98,
  author    = {Thomas Bayer and Ulrich Kressel and Heike Mogg-Schneider and
               Ingrid Renz},
  title     = {Categorizing paper documents. A generic system for domain and
               language independent text categorization},
  journal   = {Computer Vision and Image Understanding},
  year      = {1998},
  volume    = {70},
  number    = {3},
  pages     = {299--306},
  url       = {http://www.idealibrary.com/links/doi/10.1006/cviu.1998.0687/pdf},
  abstract  = {Text categorization assigns predefined categories to either
               electronically available texts or those resulting from document
               image analysis. A generic system for text categorization is
               presented which is based on statistical analysis of
               representative text corpora. Significant features are
               automatically derived from training texts by selecting substrings
               from actual word forms and applying statistical information and
               general linguistic knowledge. The dimension of the feature
               vectors is then reduced by linear transformation, keeping the
               essential information. The classification is a minimum
               least-squares approach based on polynomials. The described system
               can be efficiently adapted to new domains or different languages.
               In application, the adapted text categorizers are reliable, fast,
               and completely automatic. Two example categorization tasks
               achieve recognition scores of approximately 80\% and are very
               robust against recognition or typing errors.},
}
@inproceedings{Bekkerman01,
  author    = {Ron Bekkerman and Ran El-Yaniv and Naftali Tishby and Yoad Winter},
  title     = {On Feature Distributional Clustering for Text Categorization},
  booktitle = {Proceedings of SIGIR-01, 24th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {W. Bruce Croft and David J. Harper and Donald H. Kraft and Justin
               Zobel},
  publisher = {{ACM} Press, New York, {US}},
  address   = {New Orleans, {US}},
  year      = {2001},
  pages     = {146--153},
  url       = {http://www.cs.huji.ac.il/labs/learning/Papers/sigir.ps.gz},
  abstract  = {We describe a text categorization approach that is based on a
               combination of feature distributional clusters with a support
               vector machine (SVM) classifier. Our feature selection approach
               employs distributional clustering of words via the recently
               introduced information bottleneck method, which generates a more
               efficient word-cluster representation of documents. Combined with
               the classification power of an SVM, this method yields high
               performance text categorization that can outperform other recent
               methods in terms of categorization accuracy and representation
               efficiency. Comparing the accuracy of our method with other
               techniques, we observe significant dependency of the results on
               the data set. We discuss the potential reasons for this
               dependency.},
}
@inproceedings{Bel03,
  author    = {Nuria Bel and Cornelis H. Koster and Marta Villegas},
  title     = {Cross-lingual text categorization},
  booktitle = {Proceedings of ECDL-03, 7th European Conference on Research and
               Advanced Technology for Digital Libraries},
  editor    = {Traugott Koch and Ingeborg Torvik S{\o}lvberg},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Trondheim, {NO}},
  year      = {2003},
  pages     = {126--139},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 2769},
  url       = {},
  abstract  = {},
}
@inproceedings{Benkhalifa99,
  author    = {Mohamed Benkhalifa and Amine Bensaid and Abdelhak Mouradi},
  title     = {Text categorization using the semi-supervised fuzzy c-means
               algorithm},
  booktitle = {Proceedings of NAFIPS-99, 18th International Conference of the
               North American Fuzzy Information Processing Society},
  address   = {New York, {US}},
  year      = {1999},
  pages     = {561--565},
  url       = {},
  abstract  = {Text categorization (TC) is the automated assignment of text
               documents to predefined categories based on document contents. TC
               has become very important in the information retrieval area,
               where information needs have tremendously increased with the
               rapid growth of textual information sources such as the Internet.
               We compare, for text categorization, two partially supervised (or
               semi-supervised) clustering algorithms: the Semi-Supervised
               Agglomerative Hierarchical Clustering (ssAHC) algorithm (A. Amar
               et al., 1997) and the Semi-Supervised Fuzzy-c-Means (ssFCM)
               algorithm (M. Amine et al., 1996). This (semi-supervised)
               learning paradigm falls somewhere between the fully supervised
               and the fully unsupervised learning schemes, in the sense that it
               exploits both class information contained in labeled data
               (training documents) and structure information possessed by
               unlabeled data (test documents) in order to produce better
               partitions for test documents. Our experiments, make use of the
               Reuters 21578 database of documents and consist of a binary
               classification for each of the ten most populous categories of
               the Reuters database. To convert the documents into vector form,
               we experiment with different numbers of features, which we
               select, based on an information gain criterion. We verify
               experimentally that ssFCM both outperforms and takes less time
               than the Fuzzy-c-Means (FCM) algorithm. With a smaller number of
               features, ssFCM's performance is also superior to that of
               ssAHC's. Finally ssFCM results in improved performance and faster
               execution time as more weight is given to training documents.},
}
@article{Benkhalifa01,
  author    = {Mohammed Benkhalifa and Abdelhak Mouradi and Houssaine Bouyakhf},
  title     = {Integrating External Knowledge to Supplement Training Data in
               Semi-Supervised Learning for Text Categorization},
  journal   = {Information Retrieval},
  year      = {2001},
  volume    = {4},
  number    = {2},
  pages     = {91--113},
  url       = {http://www.wkap.nl/article.pdf?351286},
  abstract  = {Text Categorization (TC) is the automated assignment of text
               documents to predefined categories based on document contents. TC
               has been an application for many learning approaches, which prove
               effective. Nevertheless, TC provides many challenges to machine
               learning. In this paper, we suggest, for text categorization, the
               integration of external WordNet lexical information to supplement
               training data for a semi-supervised clustering algorithm which
               can learn from both training and test documents to classify new
               unseen documents. This algorithm is the ``Semi-Supervised Fuzzy
               c-Means'' (ssFCM). Our experiments use Reuters 21578 database and
               consist of binary classifications for categories selected from
               the 115 TOPICS classes of the Reuters collection. Using the
               Vector Space Model, each document is represented by its original
               feature vector augmented with external feature vector generated
               using WordNet. We verify experimentally that the integration of
               WordNet helps ssFCM improve its performance, effectively
               addresses the classification of documents into categories with
               few training documents and does not interfere with the use of
               training data.},
}
@article{Benkhalifa01a,
  author    = {Mohammed Benkhalifa and Abdelhak Mouradi and Houssaine Bouyakhf},
  title     = {Integrating {WordNet} knowledge to supplement training data in
               semi-supervised agglomerative hierarchical clustering for text
               categorization},
  journal   = {International Journal of Intelligent Systems},
  year      = {2001},
  volume    = {16},
  number    = {8},
  pages     = {929--947},
  url       = {http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=84503376&PLACEBO=IE.pdf},
  abstract  = {The text categorization (TC) is the automated assignment of text
               documents to predefined categories based on document contents. TC
               has been an application for many learning approaches, which
               proved effective. Nevertheless, TC provides many challenges to
               machine learning. In this paper, we suggest, for text
               categorization, the integration of external WordNet lexical
               information to supplement training data for a semi-supervised
               clustering algorithm which (i) uses a finite design set of
               labeled data to (ii) help agglomerative hierarchical clustering
               algorithms (AHC) partition a finite set of unlabeled data and
               then (iii) terminates without the capacity to classify other
               objects. This algorithm is the ``semi-supervised agglomerative
               hierarchical clustering algorithm'' (ssAHC). Our experiments use
               Reuters 21578 database and consist of binary classifications for
               categories selected from the 89 TOPICS classes of the Reuters
               collection. Using the vector space model (VSM), each document is
               represented by its original feature vector augmented with
               external feature vector generated using WordNet. We verify
               experimentally that the integration of WordNet helps ssAHC
               improve its performance, effectively addresses the classification
               of documents into categories with few training documents, and
               does not interfere with the use of training data.},
}
@inproceedings{Bennett02,
  author    = {Paul N. Bennett and Susan T. Dumais and Eric Horvitz},
  title     = {Probabilistic combination of text classifiers using reliability
               indicators: models and results},
  booktitle = {Proceedings of SIGIR-02, 25th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng
               and Kalervo J{\"{a}}rvelin},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Tampere, {FI}},
  year      = {2002},
  pages     = {207--214},
  url       = {http://doi.acm.org/10.1145/564376.564413},
  abstract  = {The intuition that different text classifiers behave in
               qualitatively different ways has long motivated attempts to build
               a better metaclassifier via some combination of classifiers. We
               introduce a probabilistic method for combining classifiers that
               considers the context-sensitive reliabilities of contributing
               classifiers. The method harnesses reliability
               indicators---variables that provide a valuable signal about the
               performance of classifiers in different situations. We provide
               background, present procedures for building metaclassifiers that
               take into consideration both reliability indicators and
               classifier outputs, and review a set of comparative studies
               undertaken to evaluate the methodology.},
}
@inproceedings{Bennett03,
  author    = {Paul N. Bennett},
  title     = {Using asymmetric distributions to improve text classifier
               probability estimates},
  booktitle = {Proceedings of SIGIR-03, 26th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Jamie Callan and Gordon Cormack and Charles Clarke and David
               Hawking and Alan Smeaton},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Toronto, {CA}},
  year      = {2003},
  pages     = {111--118},
  url       = {http://doi.acm.org/10.1145/860435.860457},
  abstract  = {Text classifiers that give probability estimates are more readily
               applicable in a variety of scenarios. For example, rather than
               choosing one set decision threshold, they can be used in a
               Bayesian risk model to issue a run-time decision which minimizes
               a user-specified cost function dynamically chosen at prediction
               time. However, the quality of the probability estimates is
               crucial. We review a variety of standard approaches to converting
               scores (and poor probability estimates) from text classifiers to
               high quality estimates and introduce new models motivated by the
               intuition that the empirical score distribution for the
               ``extremely irrelevant'', ``hard to discriminate'', and ``obviously
               relevant'' items are often significantly different. Finally, we
               analyze the experimental performance of these models over the
               outputs of two text classifiers. The analysis demonstrates that
               one of these models is theoretically attractive (introducing few
               new parameters while increasing flexibility), computationally
               efficient, and empirically preferable.},
}
@inproceedings{Biebricher88,
  author    = {Peter Biebricher and Norbert Fuhr and Gerhard Knorz and Gerhard
               Lustig and Michael Schwantner},
  title     = {The automatic indexing system {AIR/PHYS}. {F}rom research to
               application},
  booktitle = {Proceedings of SIGIR-88, 11th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Yves Chiaramella},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Grenoble, {FR}},
  year      = {1988},
  pages     = {333--342},
  note      = {Reprinted in Karen Sparck Jones and Peter Willett (eds.),
               ``Readings in Information Retrieval'', Morgan Kaufmann, San
               Francisco, US, 1997, pp.\ 513--517.},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/62437/p333-biebricher/p333-biebricher.pdf},
  abstract  = {Since October 1985, the automatic indexing system AIR/PHYS has
               been used in the input production of the physics data base of the
               Fachinformationszentrum Karlsruhe/West Germany. The texts to be
               indexed are abstracts written in English. The system of
               descriptors is prescribed. For the application of the AIR/PHYS
               system a large-scale dictionary containing more than 600000
               word-descriptor relations resp. phrase-descriptor relations has
               been developed. Most of these relations have been obtained by
               means of statistical and heuristical methods. In consequence, the
               relation system is rather imperfect. Therefore, the indexing
               system needs some fault-tolerating features. An appropriate
               indexing approach and the corresponding structure of the AIR/PHYS
               system are described. Finally, the conditions of the application
               as well as problems of further development are discussed.},
}
@inproceedings{Bigi03,
  author    = {Brigitte Bigi},
  title     = {Using {K}ullback-{L}eibler distance for text categorization},
  booktitle = {Proceedings of ECIR-03, 25th European Conference on Information
               Retrieval},
  editor    = {Fabrizio Sebastiani},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Pisa, {IT}},
  year      = {2003},
  pages     = {305--319},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 2633},
  url       = {http://link.springer.de/link/service/series/0558/papers/2633/26330305.pdf},
  abstract  = {A system that performs text categorization aims to assign
               appropriate categories from a predefined classification scheme to
               incoming documents. These assignments might be used for varied
               purposes such as filtering, or retrieval. This paper introduces a
               new effective model for text categorization with great corpus
               (more or less 1 million documents). Text categorization is
               performed using the Kullback-Leibler distance between the
               probability distribution of the document to classify and the
               probability distribution of each category. Using the same
               representation of categories, experiments show a significant
               improvement when the above mentioned method is used. KLD method
               achieve substantial improvements over the tfidf performing
               method.},
}
@article{Blei03,
  author    = {David M. Blei and Andrew Y. Ng and Michael I. Jordan},
  title     = {Latent {D}irichlet Allocation},
  journal   = {Journal of Machine Learning Research},
  year      = {2003},
  volume    = {3},
  pages     = {993--1022},
  url       = {http://www.ai.mit.edu/projects/jmlr/papers/volume3/blei03a/blei03a.pdf},
  abstract  = {We describe latent Dirichlet allocation (LDA), a generative
               probabilistic model for collections of discrete data such as text
               corpora. LDA is a three-level hierarchical Bayesian model, in
               which each item of a collection is modeled as a finite mixture
               over an underlying set of topics. Each topic is, in turn, modeled
               as an infinite mixture over an underlying set of topic
               probabilities. In the context of text modeling, the topic
               probabilities provide an explicit representation of a document.
               We present efficient approximate inference techniques based on
               variational methods and an EM algorithm for empirical Bayes
               parameter estimation. We report results in document modeling,
               text classification, and collaborative filtering, comparing to a
               mixture of unigrams model and the probabilistic LSI model.},
}
@article{Bloedorn98,
  author    = {Eric Bloedorn and Ryszard S. Michalski},
  title     = {Data-Driven Constructive Induction},
  journal   = {{IEEE} Intelligent Systems},
  year      = {1998},
  volume    = {13},
  number    = {2},
  pages     = {30--37},
  url       = {http://dlib.computer.org/ex/books/ex1998/pdf/x2030.pdf},
  abstract  = {An inductive learning program's ability to find an accurate
               hypothesis can depend on the quality of the representation space.
               The authors developed a data-driven constructive-induction method
               that uses multiple operators to improve the representation space.
               They applied it to two real-world problems.},
}
@inproceedings{Blosseville92,
  author    = {M. J. Blosseville and Georges Hebrail and M. G. Montell and N.
               Penot},
  title     = {Automatic document classification: natural language processing,
               statistical analysis, and expert system techniques used together},
  booktitle = {Proceedings of SIGIR-92, 15th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Nicholas J. Belkin and Peter Ingwersen and Annelise Mark
               Pejtersen},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Kobenhavn, {DK}},
  year      = {1992},
  pages     = {51--57},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/133160/p51-blosseville/p51-blosseville.pdf},
  abstract  = {In this paper we describe an automated method of classifying
               research project descriptions: a human expert classifies a sample
               set of projects into a set of disjoint and pre-defined classes,
               and then the computer learns from this sample how to classify new
               projects into these classes. Both textual and non-textual
               information associated with the projects are used in the learning
               and classification phases. Textual information is processed by
               two methods of analysis: a natural language analysis followed by
               a statistical analysis. Non-textual information is processed by a
               symbolic learning technique. We present the results of some
               experiments done on real data: two different classifications of
               our research projects.},
}
@article{Borko63,
  author    = {Harold Borko and Myrna Bernick},
  title     = {Automatic document classification},
  journal   = {Journal of the Association for Computing Machinery},
  year      = {1963},
  volume    = {10},
  number    = {2},
  pages     = {151--161},
  url       = {http://www.acm.org/pubs/articles/journals/jacm/1963-10-2/p151-borko/p151-borko.pdf},
}
@article{Borko64,
  author    = {Harold Borko and Myrna Bernick},
  title     = {Automatic document classification. Part II: additional
               experiments},
  journal   = {Journal of the Association for Computing Machinery},
  year      = {1964},
  volume    = {11},
  number    = {2},
  pages     = {138--151},
  url       = {http://www.acm.org/pubs/articles/journals/jacm/1964-11-2/p138-borko/p138-borko.pdf},
  abstract  = {This study reports the results of a series of experiments in the
               techniques of automatic document classifications. Two different
               classification schedules are compared along with two methods of
               automatically classifying documents into categories. It is
               concluded that, while there is no significant difference in the
               predictive efficiency between the Bayesian and the Factor Score
               methods, automatic document classification is enhanced by the use
               of a factor-analytically-derived classification schedule.
               Approximately 55 percent of the documents were automatedly and
               correctly classified.},
}
@inproceedings{Brank02a,
  author    = {Janez Brank and Marko Grobelnik and Natasa Mili{\'{c}}-Frayling
               and Dunja Mladeni{\'{c}}},
  title     = {Feature selection using support vector machines},
  booktitle = {Proceedings of the 3rd International Conference on Data Mining
               Methods and Databases for Engineering, Finance, and Other Fields},
  address   = {Bologna, {IT}},
  year      = {2002},
  pages     = {},
  url       = {http://www.brank.org/msr/FsNormal/Bologna/bologna-paper-4.pdf},
  abstract  = {Text categorization is the task of classifying natural language
               documents into a set of predefined categories. Documents are
               typically represented by sparse vectors under the vector space
               model, where each word in the vocabulary is mapped to one
               coordinate axis and its occurrence in the document gives rise to
               one nonzero component in the vector representing that document.
               When training classifiers on large collections of documents, both
               the time and memory requirements connected with processing of
               these vectors may be prohibitive. This calls for using a feature
               selection method, not only to reduce the number of features but
               also to increase the sparsity of document vectors. We propose a
               feature selection method based on linear Support Vector Machines
               (SVMs). First, we train the linear SVM on a subset of training
               data and retain only those features that correspond to highly
               weighted components (in absolute value sense) of the normal to
               the resulting hyperplane that separates positive and negative
               examples. This reduced feature space is then used to train a
               classifier over a larger training set because more documents now
               fit into the same amount of memory. In our experiments we compare
               the effectiveness of the SVM-based feature selection with that
               of more traditional feature selection methods, such as odds ratio
               and information gain, in achieving the desired tradeoff between
               the vector sparsity and the classification performance.
               Experimental results indicate that, at the same level of vector
               sparsity, feature selection based on SVM normals yields better
               classification performance than odds ratio- or information
               gain-based feature selection when linear SVM classifiers are used.},
}
@inproceedings{Bruckner97,
  author    = {T. Bruckner},
  title     = {The text categorization system {TEKLIS} at {TREC-6}},
  booktitle = {Proceedings of TREC-6, 6th Text Retrieval Conference},
  editor    = {Ellen M. Voorhees and Donna K. Harman},
  publisher = {National Institute of Standards and Technology, Gaithersburg, {US}},
  address   = {Gaithersburg, {US}},
  year      = {1997},
  pages     = {619--621},
  url       = {http://trec.nist.gov/pubs/trec6/papers/siemens.ps.gz},
  abstract  = {The article documents the author's participation in the filtering
               and routing tasks of TREC-6 with the commercial filtering system
               TEKLIS. TEKLIS is a training based statistical categorization
               system which incorporates shallow linguistic processing and fuzzy
               set methods. The author presents the core technology of TEKLIS,
               the results on the filtering and routing tasks and a discussion
               of the insights gained through participation in the exercise.},
}
@inproceedings{Cai03,
  author    = {Lijuan Cai and Thomas Hofmann},
  title     = {Text categorization by boosting automatically extracted concepts},
  booktitle = {Proceedings of SIGIR-03, 26th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Jamie Callan and Gordon Cormack and Charles Clarke and David
               Hawking and Alan Smeaton},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Toronto, {CA}},
  year      = {2003},
  pages     = {182--189},
  url       = {http://doi.acm.org/10.1145/860435.860469},
  abstract  = {Term-based representations of documents have found wide-spread
               use in information retrieval. However, one of the main
               shortcomings of such methods is that they largely disregard
               lexical semantics and, as a consequence, are not sufficiently
               robust with respect to variations in word usage. In this paper we
               investigate the use of concept-based document representations to
               supplement word- or phrase-based features. The utilized concepts
               are automatically extracted from documents via probabilistic
               latent semantic analysis. We propose to use AdaBoost to optimally
               combine weak hypotheses based on both types of features.
               Experimental results on standard benchmarks confirm the validity
               of our approach, showing that AdaBoost achieves consistent
               improvements by including additional semantic features in the
               learned ensemble.},
}
@article{Carbonell00,
  author  = {Jaime Carbonell and William W. Cohen and Yiming Yang},
  title   = {Guest editors' introduction to the special issue on machine
             learning and information retrieval},
  journal = {Machine Learning},
  year    = {2000},
  volume  = {39},
  number  = {2/3},
  pages   = {99--101},
  url     = {http://www.wkap.nl/article.pdf?255754},
}
@incollection{Caropreso01,
  author    = {Maria Fernanda Caropreso and Stan Matwin and Fabrizio Sebastiani},
  title     = {A learner-independent evaluation of the usefulness of statistical
               phrases for automated text categorization},
  booktitle = {Text Databases and Document Management: Theory and Practice},
  editor    = {Amita G. Chin},
  publisher = {Idea Group Publishing},
  address   = {Hershey, {US}},
  year      = {2001},
  pages     = {78--102},
  url       = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/TD01a.pdf},
  abstract  = {In this work we investigate the usefulness of {\em $n$-grams} for
               document indexing in text categorization (TC). We call $n$-gram a
               set $g_k$ of $n$ word stems, and we say that $g_k$ occurs in a
               document $d_j$ when a sequence of words appears in $d_j$ that,
               after stop word removal and stemming, consists exactly of the $n$
               stems in $g_k$, in some order. Previous researches have
               investigated the use of $n$-grams (or some variant of them) in
               the context of specific learning algorithms, and thus have not
               obtained general answers on their usefulness for TC. In this work
               we investigate the usefulness of $n$-grams in TC independently of
               any specific learning algorithm. We do so by applying feature
               selection to the pool of all $k$-grams ($k\leq n$), and checking
               how many $n$-grams score high enough to be selected in the top
               $\sigma$ $k$-grams. We report the results of our experiments,
               using various feature selection measures and varying values of
               $\sigma$, performed on the {\sc Reuters-21578} standard TC
               benchmark. We also report results of making actual use of the
               selected $n$-grams in the context of a linear classifier induced
               by means of the Rocchio method.},
}
@inProceedings{Carreras01,
author = {Xavier Carreras and Llu{\'\i}s M{\`a}rquez},
title = {Boosting Trees for Anti-Spam Email Filtering},
year = {2001},
editor = {},
booktitle = {Proceedings of RANLP-01, 4th International Conference on Recent
Advances in Natural Language Processing},
address = {Tzigov Chark, {BG}},
pages = {58--64},
url = {http://www.lsi.upc.es/~carreras/pub/boospam.ps},
}
@inProceedings{Cavnar94,
author = {William B. Cavnar and John M. Trenkle},
title = {N-Gram-Based Text Categorization},
booktitle = {Proceedings of SDAIR-94, 3rd Annual Symposium on Document
Analysis and Information Retrieval},
publisher = {},
editor = {},
year = {1994},
address = {Las Vegas, {US}},
pages = {161--175},
url = {http://www.nonlineardynamics.com/trenkle/papers/sdair-94-bc.ps.gz},
abstract = {Text categorization is a fundamental task in document
processing, allowing the automated handling of enormous streams
of documents in electronic form. One difficulty in handling some
classes of documents is the presence of different kinds of
textual errors, such as spelling and grammatical errors in email,
and character recognition errors in documents that come through
OCR. Text categorization must work reliably on all input, and
thus must tolerate some level of these kinds of problems. We
describe here an N-gram-based approach to text categorization
that is tolerant of textual errors. The system is small, fast and
robust. This system worked very well for language classification,
achieving in one test a 99.8\% correct classification rate on
Usenet newsgroup articles written in different languages. The
system also worked reasonably well for classifying articles from
a number of different computer-oriented newsgroups according to
subject, achieving as high as an 80\% correct classification
rate. There are also several obvious directions for improving the
system's classification performance in those cases where it did
not do as well. The system is based on calculating and comparing
profiles of N-gram frequencies. First, we use the system to
compute profiles on training set data that represent the various
categories, e.g., language samples or newsgroup content samples.
Then the system computes a profile for a particular document that
is to be classified. Finally, the system computes a distance
measure between the document's profile and each of the category
profiles. The system selects the category whose profile has the
smallest distance to the document's profile. The profiles
involved are quite small, typically 10K bytes for a category
training set, and less than 4K bytes for an individual document.
Using N-gram frequency profiles provides a simple and reliable
way to categorize documents in a wide range of classification
tasks.},
}
@inProceedings{Ceci03,
author = {Michelangelo Ceci and Donato Malerba},
title = {Hierarchical Classification of {HTML} Documents with {WebClassII}},
booktitle = {Proceedings of ECIR-03, 25th European Conference on Information
Retrieval},
publisher = {Springer Verlag, Heidelberg, {DE}},
editor = {Fabrizio Sebastiani},
address = {Pisa, {IT}},
note = {Published in the ``Lecture Notes in Computer Science'' series,
number 2633},
year = {2003},
pages = {57--72},
url = {http://link.springer.de/link/service/series/0558/papers/2633/26330057.pdf},
abstract = {This paper describes a new method for the classification of a
HTML document into a hierarchy of categories. The hierarchy of
categories is involved in all phases of automated document
classification, namely feature extraction, learning, and
classification of a new document. The innovative aspects of this
work are the feature selection process, the automated threshold
determination for classification scores, and an experimental
study on real-world Web documents that can be associated to any
node in the hierarchy. Moreover, a new measure for the evaluation
of system performances has been introduced in order to compare
three different techniques (flat, hierarchical with proper
training sets, hierarchical with hierarchical training sets). The
method has been implemented in the context of a client-server
application, named WebClassII. Results show that for hierarchical
techniques it is better to use hierarchical training sets.},
}
@inproceedings{Cerny83,
  author    = {Barbara A. Cerny and Anna Okseniuk and J. Dennis Lawrence},
  title     = {A fuzzy measure of agreement between machine and manual
               assignment of documents to subject categories},
  booktitle = {Proceedings of ASIS-83, 46th Annual Meeting of the American
               Society for Information Science},
  editor    = {Raymond F. Vondran and Anne Caputo and Carol Wasserman and
               Richard A. Diener},
  publisher = {American Society for Information Science, Washington, {US}},
  address   = {Washington, {US}},
  year      = {1983},
  pages     = {265},
  url       = {},
}
@inProceedings{Chakrabarti97,
author = {Soumen Chakrabarti and Byron E. Dom and Rakesh Agrawal and
Prabhakar Raghavan},
title = {Using taxonomy, discriminants, and signatures for navigating in
text databases},
booktitle = {Proceedings of VLDB-97, 23rd International Conference on Very
Large Data Bases},
publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
editor = {Matthias Jarke and Michael J. Carey and Klaus R. Dittrich and
Frederick H. Lochovsky and Pericles Loucopoulos and Manfred A.
Jeusfeld},
year = {1997},
address = {Athens, {GR}},
pages = {446--455},
url = {http://www.vldb.org/conf/1997/P446.PDF},
note = {An extended version appears as~\cite{Chakrabarti98c}},
abstract = {We explore how to organize a text database hierarchically to aid
better searching and browsing. We propose to exploit the natural
hierarchy of topics, or taxonomy, that many corpora, such as
internet directories, digital libraries, and patent databases
enjoy. In our system, the user navigates through the query
response not as a flat unstructured list, but embedded in the
familiar taxonomy, and annotated with document signatures
computed dynamically with respect to where the user is located at
any time. We show how to update such databases with new documents
with high speed and accuracy. We use techniques from statistical
pattern recognition to efficiently separate the feature words or
discriminants from the noise words at each node of the taxonomy.
Using these, we build a multi-level classifier. At each node,
this classifier can ignore the large number of noise words in a
document. Thus the classifier has a small model size and is very
fast. However, owing to the use of context-sensitive features,
the classifier is very accurate. We report on experiences with
the Reuters newswire benchmark, the US Patent database, and web
document samples from {{\sc Yahoo!}}.},
}
@article{Chakrabarti98c,
author = {Soumen Chakrabarti and Byron E. Dom and Rakesh Agrawal and
Prabhakar Raghavan},
title = {Scalable feature selection, classification and signature
generation for organizing large text databases into hierarchical
topic taxonomies},
journal = {Journal of Very Large Data Bases},
year = {1998},
number = {3},
volume = {7},
pages = {163--178},
url = {http://www.cs.berkeley.edu/~soumen/VLDB54_3.PDF},
abstract = {We explore how to organize large text databases hierarchically by
topic to aid better searching, browsing and filtering. Many
corpora, such as internet directories, digital libraries, and
patent databases are manually organized into topic hierarchies,
also called taxonomies. Similar to indices for relational data,
taxonomies make search and access more efficient. However, the
exponential growth in the volume of on-line textual information
makes it nearly impossible to maintain such taxonomic
organization for large, fast-changing corpora by hand. We
describe an automatic system that starts with a small sample of
the corpus in which topics have been assigned by hand, and then
updates the database with new documents as the corpus grows,
assigning topics to these new documents with high speed and
accuracy. To do this, we use techniques from statistical pattern
recognition to efficiently separate the feature words, or
discriminants, from the noise words at each node of the taxonomy.
Using these, we build a multilevel classifier. At each node, this
classifier can ignore the large number of ``noise'' words in a
document. Thus, the classifier has a small model size and is very
fast. Owing to the use of context-sensitive features, the
classifier is very accurate. As a by-product, we can compute for
each document a set of terms that occur significantly more often
in it than in the classes to which it belongs. We describe the
design and implementation of our system, stressing how to exploit
standard, efficient relational operations like sorts and joins.
We report on experiences with the Reuters newswire benchmark, the
US patent database, and web document samples from Yahoo!. We
discuss applications where our system can improve searching and
filtering capabilities.},
}
@inproceedings{Chakrabarti98b,
  author    = {Soumen Chakrabarti and Byron E. Dom and Piotr Indyk},
  title     = {Enhanced hypertext categorization using hyperlinks},
  booktitle = {Proceedings of SIGMOD-98, ACM International Conference on
               Management of Data},
  editor    = {Laura M. Haas and Ashutosh Tiwary},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Seattle, {US}},
  year      = {1998},
  pages     = {307--318},
  url       = {http://www.acm.org/pubs/articles/proceedings/mod/276304/p307-chakrabarti/p307-chakrabarti.pdf},
  abstract  = {A major challenge in indexing unstructured hypertext databases is
               to automatically extract meta-data that enables structured
               searching using topic taxonomies, circumvents keyword ambiguity
               and improves the quality of searching and profile-based routing
               and filtering. Therefore, an accurate classifier is an essential
               component of a hypertext database. Hyperlinks pose new problems
               not addressed in the extensive text classification literature.
               Links clearly contain high-quality semantic clues that are lost
               upon a purely term-based classifier, but exploiting link
               information is non-trivial because it is noisy. Naive use of
               terms in the link neighborhood of a document can even degrade
               accuracy. Our contribution is to propose robust statistical
               models and a relaxation labeling technique for better
               classification by exploiting link information in a small
               neighborhood around documents. Our technique also adapts
               gracefully to the fraction of neighboring documents having known
               topics. We experimented with pre-classified samples from {{\sc
               Yahoo!}}\ and the US Patent Database. We have developed a text
               classifier that misclassifies only 13\% of the documents in the
               Reuters benchmark; this is comparable to the best results ever
               obtained. Our new classifier misclassified 36\% of the patents,
               indicating that classifying hypertext can be more difficult than
               classifying text. Naively using terms in neighboring documents
               increased the error to 38\%; our hypertext classifier reduced it
               to 21\%. Results with the Yahoo! sample were more dramatic: the
               text classifier showed a 68\% error, whereas our hypertext
               classifier reduced this to just 21\%.},
}
@article{Chakrabarti99,
author = {Soumen Chakrabarti and Byron E. Dom and S. Ravi Kumar and
Prabhakar Raghavan and Sridhar Rajagopalan and Andrew Tomkins and
David Gibson and Jon Kleinberg},
title = {Mining the {W}eb's link structure},
journal = {Computer},
year = {1999},
number = {8},
volume = {32},
pages = {60--67},
url = {http://dlib.computer.org/co/books/co1999/pdf/r8060.pdf},
abstract = {The Web is a hypertext body of approximately 300 million pages
that continues to grow at roughly a million pages per day. Page
variation is more prodigious than the data's raw scale: Taken as
a whole, the set of Web pages lacks a unifying structure and
shows far more authoring style and content variation than that
seen in traditional text-document collections. This level of
complexity makes an ``off-the-shelf'' database-management and
information-retrieval solution impossible. To date, index-based
search engines for the Web have been the primary tool by which
users search for information. Such engines can build giant
indices that let you quickly retrieve the set of all Web pages
containing a given word or string. Experienced users can make
effective use of such engines for tasks that can be solved by
searching for tightly constrained keywords and phrases. These
search engines are, however, unsuited for a wide range of equally
important tasks. In particular, a topic of any breadth will
typically contain several thousand or million relevant Web pages.
How then, from this sea of pages, should a search engine select
the correct ones---those of most value to the user?},
}
@inProceedings{Chakrabarti02,
author = {Soumen Chakrabarti and Shourya Roy and Mahesh Soundalgekar},
title = {Fast and accurate text classification via multiple linear
discriminant projections},
booktitle = {Proceedings of VLDB-02, 28th International Conference on Very
Large Data Bases},
publisher = {},
editor = {},
year = {2002},
address = {Hong Kong, {CN}},
pages = {658--669},
url = {http://www.vldb.org/conf/2002/S19P01.pdf},
abstract = {Support vector machines (SVMs) have shown superb performance for
text classification tasks. They are accurate, robust, and quick
to apply to test instances. Their only potential drawback is
their training time and memory requirement. For $n$ training
instances held in memory, the best-known SVM implementations take
time proportional to $n^a$, where $a$ is typically between 1.8 and
2.1. SVMs have been trained on data sets with several thousand
instances, but Web directories today contain millions of
instances which are valuable for mapping billions of Web pages
into Yahoo!-like directories. We present SIMPL, a nearly
linear-time classification algorithm which mimics the strengths
of SVMs while avoiding the training bottleneck. It uses Fisher's
linear discriminant, a classical tool from statistical pattern
recognition, to project training instances to a carefully
selected low-dimensional subspace before inducing a decision tree
on the projected instances. SIMPL uses efficient sequential scans
and sorts, and is comparable in speed and memory scalability to
widely-used naive Bayes (NB) classifiers, but it beats NB
accuracy decisively. It not only approaches and sometimes exceeds
SVM accuracy, but also beats SVM running time by orders of
magnitude. While developing SIMPL, we also make a detailed
experimental analysis of the cache performance of SVMs.},
}
@inproceedings{Chai02,
  author    = {Kian M. Chai and Hwee T. Ng and Hai L. Chieu},
  title     = {Bayesian online classifiers for text classification and filtering},
  booktitle = {Proceedings of SIGIR-02, 25th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng
               and Kalervo J{\"{a}}rvelin},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Tampere, {FI}},
  year      = {2002},
  pages     = {97--104},
  url       = {http://doi.acm.org/10.1145/564376.564395},
  abstract  = {This paper explores the use of Bayesian online classifiers to
               classify text documents. Empirical results indicate that these
               classifiers are comparable with the best text classification
               systems. Furthermore, the online approach offers the advantage of
               continuous learning in the batch-adaptive text filtering task.},
}
@inProceedings{Chandrinos00,
author = {Konstantinos V. Chandrinos and Ion Androutsopoulos and Georgios
Paliouras and Constantine D. Spyropoulos},
title = {Automatic {W}eb Rating: Filtering Obscene Content on the {W}eb},
booktitle = {Proceedings of ECDL-00, 4th European Conference on Research and
Advanced Technology for Digital Libraries},
editor = {Jos{\'e} L. Borbinha and Thomas Baker},
publisher = {Springer Verlag, Heidelberg, {DE}},
note = {Published in the ``Lecture Notes in Computer Science'' series,
number 1923},
year = {2000},
address = {Lisbon, {PT}},
pages = {403--406},
url = {http://link.springer.de/link/service/series/0558/papers/1923/19230403.pdf},
abstract = {We present a method to detect automatically pornographic content
on the Web. Our method combines techniques from language
engineering and image analysis within a machine-learning
framework. Experimental results show that it achieves nearly
perfect performance on a set of hard cases.},
}
@inProceedings{Chen01,
author = {Chien Chin Chen and Meng Chang Chen and Yeali Sun},
title = {{PVA}: A Self-Adaptive Personal View Agent},
booktitle = {Proceedings of KDD-01, 7th ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining},
editor = {Foster Provost and Ramakrishnan Srikant},
year = {2001},
pages = {257--262},
publisher = {{ACM} Press, New York, {US}},
address = {San Francisco, {US}},
url = {http://doi.acm.org/10.1145/502512.502548},
abstract = {In this paper, we present PVA, an adaptive personal view
information agent system to track, learn and manage, user's
interests in Internet documents. When user's interests change,
PVA, in not only the contents, but also in the structure of user
profile, is modified to adapt to the changes. Experimental
results show that modulating the structure of user profile does
increase the accuracy of personalization systems.},
}
@article{Chen02,
author = {Chien Chin Chen and Meng Chang Chen and Yeali Sun},
title = {{PVA}: A Self-Adaptive Personal View Agent},
journal = {Journal of Intelligent Information Systems},
year = {2002},
note = {Special Issue on Automated Text Categorization},
volume = {18},
number = {2/3},
pages = {173--194},
url = {http://www.wkap.nl/article.pdf?391245},
abstract = {In this paper, we present PVA, an adaptive personal view
information agent system for tracking, learning and managing user
interests in Internet documents. PVA consists of three parts: a
{\it proxy}, {\it personal view constructor}, and {\it personal
view maintainer}. The proxy logs the user's activities and
extracts the user's interests without user intervention. The
personal view constructor mines user interests and maps them to a
class hierarchy (i.e., personal view). The personal view
maintainer synchronizes user interests and the personal view
periodically. When user interests change, in PVA, not only the
contents, but also the structure of the user profile are modified
to adapt to the changes. In addition, PVA considers the aging
problem of user interests. The experimental results show that
modulating the structure of the user profile increases the
accuracy of a personalization system.},
}
@inproceedings{Chen00,
  author    = {Hao Chen and Susan T. Dumais},
  title     = {Bringing order to the {W}eb: automatically categorizing search
               results},
  booktitle = {Proceedings of CHI-00, ACM International Conference on Human
               Factors in Computing Systems},
  editor    = {},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Den Haag, {NL}},
  year      = {2000},
  pages     = {145--152},
  url       = {http://www.acm.org/pubs/articles/proceedings/chi/332040/p145-chen/p145-chen.pdf},
  abstract  = {We developed a user interface that organizes Web search results
               into hierarchical categories. Text classification algorithms were
               used to automatically classify arbitrary search results into an
               existing category structure on-the-fly. A user study compared our
               new category interface with the typical ranked list interface of
               search results. The study showed that the category interface is
               superior both in objective and subjective measures. Subjects
               liked the category interface much better than the list interface,
               and they were 50\% faster at finding information that was
               organized into categories. Organizing search results allows users
               to focus on items in categories of interest rather than having to
               browse through all the results sequentially.},
}
@inproceedings{Chen00a,
  author    = {Hao Chen and Tin Kam Ho},
  title     = {Evaluation of Decision Forests on Text Categorization},
  booktitle = {Proceedings of the 7th SPIE Conference on Document Recognition
               and Retrieval},
  editor    = {Daniel P. Lopresti and Jiangying Zhou},
  publisher = {{SPIE} {}-{} The International Society for Optical Engineering},
  address   = {San Jose, {US}},
  year      = {2000},
  pages     = {191--199},
  url       = {http://cm.bell-labs.com/who/tkh/papers/textcat.ps.gz},
  abstract  = {Text categorization is useful for indexing documents for
               information retrieval, filtering parts for document
               understanding, and summarizing contents of documents of special
               interests. We describe a text categorization task and an
               experiment using documents from the Reuters and OHSUMED
               collections. We applied the Decision Forest classifier and
               compared its accuracies to those of C4.5 and kNN classifiers,
               using both category dependent and category independent term
               selection schemes. It is found that Decision Forest outperforms
               both C4.5 and kNN in all cases, and that category dependent term
               selection yields better accuracies. Performances of all three
               classifiers degrade from the Reuters collection to the OHSUMED
               collection, but Decision Forest remains to be superior.},
}
@inProceedings{Cheng01,
author = {Cheng, Chun-Hung and Jian Tang and Ada Wai-Chee Fu and Irwin King},
title = {Hierarchical Classification of Documents with Error Control},
booktitle = {Proceedings of PAKDD-01, 5th Pacific-Asia Conference on
Knowledge Discovery and Data Mining},
editor = {David Cheung and Qing Li and Graham Williams},
year = {2001},
publisher = {Springer Verlag, Heidelberg, {DE}},
address = {Hong Kong, {CN}},
note = {Published in the ``Lecture Notes in Computer Science'' series,
number 2035},
pages = {433--443},
url = {http://link.springer-ny.com/link/service/series/0558/papers/2035/20350433.pdf},
abstract = {Classification is a function that matches a new object with one
of the predefined classes. Document classification is
characterized by the large number of attributes involved in the
objects (documents). The traditional method of building a single
classifier to do all the classification work would incur a high
overhead. Hierarchical classification is a more efficient
method - instead of a single classifier, we use a set of
classifiers distributed over a class taxonomy, one for each
internal node. However, once a misclassification occurs at a high
level class, it may result in a class that is far apart from the
correct one. An existing approach to coping with this problem
requires terms also to be arranged hierarchically. In this paper,
instead of overhauling the classifier itself, we propose
mechanisms to detect misclassification and take appropriate
actions. We then discuss an alternative that masks the
misclassification based on a well known software fault tolerance
technique. Our experiments show our algorithms represent a good
trade-off between speed and accuracy in most applications.},
}
@inProceedings{Cheong02,
author = {Fung, Gabriel Pui Cheong and Jeffrey X. Yu and Hongjun Lu},
title = {Discriminative Category Matching: Efficient Text Classification
for Huge Document Collections},
booktitle = {Proceedings of ICDM-02, 2nd IEEE International Conference on Data
Mining},
editor = {},
publisher = {{IEEE} Computer Society Press, Los Alamitos, {US}},
address = {Maebashi City, {JP}},
year = {2002},
pages = {187--194},
url = {http://dlib.computer.org/conferen/icdm/1754/pdf/17540187.pdf},
abstract = {With the rapid growth of textual information available on the
Internet, having a good model for classifying and managing
documents automatically is undoubtly important. When more
documents are archived, new terms, new concepts and concept-drift
will frequently appear. Without a doubt, updating the
classification model frequently rather than using the old model
for a very long period is absolutely essential. Here, the
challenges are: a) obtain a high accuracy classification model;
b) consume low computational time for both model training and
operation; and c) occupy low storage space. However, none of the
existing classification approaches could achieve all of these
requirements. In this paper, we propose a novel text
classification approach, called Discriminative Category Matching,
which could achieve all of the stated characteristics. Extensive
experiments using two benchmarks and a large real-life collection
are conducted. The encouraging results indicated that our
approach is highly feasible.},
}
@article{Chouchoulas01,
author = {Alexios Chouchoulas and Qiang Shen},
title = {Rough set-aided keyword reduction for text categorization},
journal = {Applied Artificial Intelligence},
pages = {843--873},
year = {2001},
volume = {15},
number = {9},
url = {},
abstract = {The volume of electronically stored information increases
exponentially as the state of the art progresses. Automated
information filtering (IF) and information retrieval (IR) systems
are therefore acquiring rapidly increasing prominence. However,
such systems sacrifice efficiency to boost effectiveness. Such
systems typically have to cope with sets of vectors of many tens
of thousands of dimensions. Rough set (RS) theory can be applied
to reducing the dimensionality of data used in IF/IR tasks, by
providing a measure of the information content of datasets with
respect to a given classification. This can aid IF/IR systems
that rely on the acquisition of large numbers of term weights or
other measures of relevance. This article investigates the
applicability of RS theory to the IF/IR application domain and
compares this applicability with respect to various existing TC
techniques. The ability of the approach to generalize, given a
minimum of training data, is also addressed. The background of RS
theory is presented, with an illustrative example to demonstrate
the operation of the RS-based dimensionality reduction. A modular
system is proposed which allows the integration of this technique
with a large variety of different IF/IR approaches. The example
application, categorization of E-mail messages, is described.
Systematic experiments and their results are reported and
analyzed.},
}
@inProceedings{Chuang00,
author = {Wesley T. Chuang and Asok Tiyyagura and Jihoon Yang and Giovanni
Giuffrida},
title = {A Fast Algorithm for Hierarchical Text Classification},
booktitle = {Proceedings of DaWaK-00, 2nd International Conference on Data
Warehousing and Knowledge Discovery},
editor = {Yahiko Kambayashi and Mukesh Mohania and A Min Tjoa},
year = {2000},
publisher = {Springer Verlag, Heidelberg, {DE}},
note = {Published in the ``Lecture Notes in Computer Science'' series,
number 1874},
address = {London, {UK}},
pages = {409--418},
url = {http://www.cs.iastate.edu/~yang/Papers/dawak00.ps},
abstract = {Text classification is becoming more important with the
proliferation of the Internet and the huge amount of data it
transfers. We present an efficient algorithm for text
classification using hierarchical classifiers based on a concept
hierarchy. The simple TFIDF classifier is chosen to train sample
data and to classify other new data. Despite its simplicity,
results of experiments on Web pages and TV closed captions
demonstrate high classification accuracy. Application of feature
subset selection techniques improves the performance. Our
algorithm is computationally efficient being bounded by O(n log
n) for n samples.},
}
@inproceedings{Ciravegna99,
  author    = {Fabio Ciravegna and Alberto Lavelli and Nadia Mana and Johannes
               Matiasek and Luca Gilardoni and Silvia Mazza and William J. Black
               and Fabio Rinaldi},
  title     = {{FACILE}: Classifying Texts Integrating Pattern Matching and
               Information Extraction},
  booktitle = {Proceedings of IJCAI-99, 16th International Joint Conference on
               Artificial Intelligence},
  editor    = {Thomas Dean},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Stockholm, {SE}},
  year      = {1999},
  pages     = {890--895},
  url       = {http://ecate.itc.it:1024/lavelli/lavelli-papers/IJCAI99/ijcai99.ps.gz},
  abstract  = {Successfully managing information means being able to find
               relevant new information and to correctly integrate it with
               pre-existing knowledge. Much information is nowadays stored as
               multilingual textual data; therefore advanced classification
               systems are currently considered as strategic components for
               effective knowledge management. We describe an experience
               integrating different innovative AI technologies such as
               hierarchical pattern matching and information extraction to
               provide flexible multilingual classification adaptable to user
               needs. Pattern matching produces fairly accurate and fast
               categorisation over a large number of classes, while information
               extraction provides fine-grained classification for a reduced
               number of classes. The resulting system was adopted by the main
               Italian financial news agency providing a pay-to-view service.},
}
@inproceedings{Clack97,
  author    = {Chris Clack and Johnny Farringdon and Peter Lidwell and Tina Yu},
  title     = {Autonomous document classification for business},
  booktitle = {Proceedings of the 1st International Conference on Autonomous
               Agents},
  editor    = {W. Lewis Johnson},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Marina Del Rey, {US}},
  year      = {1997},
  pages     = {201--208},
  url       = {http://www.acm.org/pubs/articles/proceedings/ai/267658/p201-clack/p201-clack.pdf},
  abstract  = {With the continuing exponential growth of the Internet and the
               more recent growth of business Intranets, the commercial world is
               becoming increasingly aware of the problem of electronic
               information overload. This has encouraged interest in developing
               agents/softbots that can act as electronic personal assistants
               and can develop and adapt representations of users information
               needs, commonly known as profiles. As the result of collaborative
               research with Friends of the Earth, an environmental issues
               campaigning organisation, we have developed a general purpose
               information classification agent architecture and have applied it
               to the problem of document classification and routing.
               Collaboration with Friends of the Earth allows us to test our
               ideas in a non-academic context involving high volumes of
               documents. We use the technique of genetic programming (GP),
               (Koza and Rice 1992), to evolve classifying agents. This is a
               novel approach for document classification, where each agent
               evolves a parse-tree representation of a user's particular
               information need. The other unusual features of our research are
               the longevity of our agents and the fact that they undergo a
               continual training process; feedback from the user enables the
               agent to adapt to the user's long-term information requirements.},
}
@inproceedings{Cohen95,
  author    = {William W. Cohen},
  title     = {Text categorization and relational learning},
  booktitle = {Proceedings of ICML-95, 12th International Conference on Machine
               Learning},
  editor    = {Armand Prieditis and Stuart J. Russell},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Lake Tahoe, {US}},
  year      = {1995},
  pages     = {124--132},
  url       = {http://www.research.whizbang.com/~wcohen/postscript/ml-95-ir.ps},
  abstract  = {We evaluate the first order learning system FOIL on a series of
               text categorization problems. It is shown that FOIL usually forms
               classifiers with lower error rates and higher rates of precision
               and recall with a relational encoding than with a propositional
               encoding. We show that FOIL's performance can be improved by
               relation selection, a first order analog of feature selection.
               Relation selection improves FOIL's performance as measured by any
               of recall, precision, F-measure, or error rate. With an
               appropriate level of relation selection, FOIL appears to be
               competitive with or superior to existing propositional
               techniques.},
}
@incollection{Cohen95a,
  author    = {William W. Cohen},
  title     = {Learning to classify {E}nglish text with {ILP} methods},
  booktitle = {Advances in inductive logic programming},
  editor    = {De Raedt, Luc},
  publisher = {{IOS} Press},
  address   = {Amsterdam, {NL}},
  year      = {1995},
  pages     = {124--143},
  url       = {http://www.research.whizbang.com/~wcohen/postscript/ilp.ps},
  abstract  = {Text categorization is the task of classifying text into one of
               several predefined categories. In this paper we will evaluate the
               effectiveness of several ILP methods for text categorization, and
               also compare them to their propositional analogs. The methods
               considered are FOIL, the propositional rule-learning system
               RIPPER, and a first-order version of RIPPER called FLIPPER. We
               show that the benefit of using a first-order representation in
               this domain is relatively modest; in particular, the performance
               difference between FLIPPER and FOIL and their propositional
               counterparts is quite small, compared to the differences between
               FOIL and FLIPPER. However, a first-order representation seems to
               be advantageous when high-precision classifiers are desirable.},
}
@inproceedings{Cohen96a,
  author    = {William W. Cohen and Yoram Singer},
  title     = {Context-sensitive learning methods for text categorization},
  booktitle = {Proceedings of SIGIR-96, 19th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Hans-Peter Frei and Donna Harman and Peter Sch{\"{a}}uble and
               Ross Wilkinson},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Z{\"{u}}rich, {CH}},
  year      = {1996},
  pages     = {307--315},
  note      = {An extended version appears as~\cite{Cohen99}},
  url       = {http://www.research.whizbang.com/~wcohen/postscript/sigir-96.ps},
  abstract  = {Two machine learning algorithms, RIPPER and sleeping experts for
               phrases, are evaluated on a number of large text categorization
               problems. These algorithms both construct classifiers that allow
               the ``context'' of a word w to affect how (or even whether) the
               presence or absence of w will contribute to a classification.
               However, RIPPER and sleeping experts differ radically in many
               other respects. Differences include: different notions as to what
               constitutes a context; different ways of combining contexts to
               construct a classifier; different methods to search for a
               combination of contexts; and different criteria as to what
               contexts should be included in such a combination. In spite of
               these differences, both RIPPER and sleeping experts perform
               extremely well across a wide variety of categorization problems,
               generally outperforming previously applied learning methods. We
               view this result as a confirmation of the usefulness of
               classifiers that represent contextual information.},
}
@inproceedings{Cohen98,
  author    = {William W. Cohen and Haym Hirsh},
  title     = {Joins that generalize: text classification using {{\sc Whirl}}},
  booktitle = {Proceedings of KDD-98, 4th International Conference on Knowledge
               Discovery and Data Mining},
  editor    = {Rakesh Agrawal and Paul E. Stolorz and Gregory Piatetsky-Shapiro},
  publisher = {{AAAI} Press, Menlo Park, {US}},
  address   = {New York, {US}},
  year      = {1998},
  pages     = {169--173},
  url       = {http://www.research.whizbang.com/~wcohen/postscript/kdd-98.ps},
  abstract  = {WHIRL is an extension of relational databases that can perform
               ``soft joins'' based on the similarity of textual identifiers;
               these soft joins extend the traditional operation of joining
               tables based on the equivalence of atomic values. This paper
               evaluates WHIRL on a number of inductive classification tasks
               using data from the World Wide Web. We show that although WHIRL
               is designed for more general similarity-based reasoning tasks, it
               is competitive with mature inductive classification systems on
               these classification tasks. In particular, WHIRL generally
               achieves lower generalization error than C4.5, RIPPER, and
               several nearest-neighbor methods. WHIRL is also fast---up to 500
               times faster than C4.5 on some benchmark problems. We also show
               that WHIRL can be efficiently used to select from a large pool of
               unlabeled items those that can be classified correctly with high
               confidence.},
}
@article{Cohen99,
  author    = {William W. Cohen and Yoram Singer},
  title     = {Context-sensitive learning methods for text categorization},
  journal   = {{ACM} Transactions on Information Systems},
  volume    = {17},
  number    = {2},
  pages     = {141--173},
  year      = {1999},
  url       = {http://www.acm.org/pubs/articles/journals/tois/1999-17-2/p141-cohen/p141-cohen.pdf},
  abstract  = {Two recently implemented machine-learning algorithms, RIPPER and
               sleeping-experts for phrases, are evaluated on a number of large
               text categorization problems. These algorithms both construct
               classifiers that allow the ``context'' of a word w to affect how
               (or even whether) the presence or absence of w will contribute to
               a classification. However, RIPPER and sleeping-experts differ
               radically in many other respects: differences include different
               notions as to what constitutes a context, different ways of
               combining contexts to construct a classifier, different methods
               to search for a combination of contexts, and different criteria
               as to what contexts should be included in such a combination. In
               spite of these differences, both RIPPER and sleeping-experts
               perform extremely well across a wide variety of categorization
               problems, generally outperforming previously applied learning
               methods. We view this result as a confirmation of the usefulness
               of classifiers that represent contextual information.},
}
@inproceedings{Crammer02,
  author    = {Koby Crammer and Yoram Singer},
  title     = {A New Family of Online Algorithms for Category Ranking},
  booktitle = {Proceedings of SIGIR-02, 25th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng
               and Kalervo J{\"{a}}rvelin},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Tampere, {FI}},
  year      = {2002},
  pages     = {151--158},
  doi       = {10.1145/564376.564404},
  url       = {http://doi.acm.org/10.1145/564376.564404},
  abstract  = {We describe a new family of topic-ranking algorithms for
               multi-labeled documents. The motivation for the algorithms stems
               from recent advances in online learning algorithms. The
               algorithms we present are simple to implement and are time and
               memory efficient. We evaluate the algorithms on the Reuters-21578
               corpus and the new corpus released by Reuters in 2000. On both
               corpora the algorithms we present outperform adaptations to
               topic-ranking of Rocchio's algorithm and the Perceptron
               algorithm. We also outline the formal analysis of the algorithm
               in the mistake bound model. To our knowledge, this work is the
               first to report performance results with the entire new Reuters
               corpus.},
}
@inproceedings{Craven98,
  author    = {Mark Craven and Dan DiPasquo and Dayne Freitag and Andrew K.
               McCallum and Tom M. Mitchell and Kamal Nigam and Se{\'{a}}n
               Slattery},
  title     = {Learning to extract symbolic knowledge from the {W}orld {W}ide
               {W}eb},
  booktitle = {Proceedings of AAAI-98, 15th Conference of the American
               Association for Artificial Intelligence},
  publisher = {{AAAI} Press, Menlo Park, {US}},
  address   = {Madison, {US}},
  year      = {1998},
  pages     = {509--516},
  note      = {An extended version appears as~\cite{Craven00}},
  url       = {http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-11/www/wwkb/overview-aaai98.ps.gz},
  abstract  = {The World Wide Web is a vast source of information accessible to
               computers, but understandable only to humans. The goal of the
               research described here is to automatically create a computer
               understandable world wide knowledge base whose content mirrors
               that of the World Wide Web. Such a knowledge base would enable
               much more effective retrieval of Web information, and promote new
               uses of the Web to support knowledge-based inference and problem
               solving. Our approach is to develop a trainable information
               extraction system that takes two inputs: an ontology defining the
               classes and relations of interest, and a set of training data
               consisting of labeled regions of hypertext representing instances
               of these classes and relations. Given these inputs, the system
               learns to extract information from other pages and hyperlinks on
               the Web. This paper describes our general approach, several
               machine learning algorithms for this task, and promising initial
               results with a prototype system.},
}
@article{Craven00,
  author    = {Mark Craven and Dan DiPasquo and Dayne Freitag and Andrew K.
               McCallum and Tom M. Mitchell and Kamal Nigam and Se{\'{a}}n
               Slattery},
  title     = {Learning to Construct Knowledge Bases from the {W}orld {W}ide
               {W}eb},
  journal   = {Artificial Intelligence},
  volume    = {118},
  number    = {1/2},
  pages     = {69--113},
  year      = {2000},
  url       = {http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-11/www/wwkb/overview-aij99.ps.gz},
  abstract  = {The World Wide Web is a vast source of information accessible to
               computers, but understandable only to humans. The goal of the
               research described here is to automatically create a computer
               understandable knowledge base whose content mirrors that of the
               World Wide Web. Such a knowledge base would enable much more
               effective retrieval of Web information, and promote new uses of
               the Web to support knowledge-based inference and problem solving.
               Our approach is to develop a trainable information extraction
               system that takes two inputs. The first is an ontology that
               defines the classes (e.g., company, person, employee, product)
               and relations (e.g., employed\_by, produced\_by) of interest when
               creating the knowledge base. The second is a set of training data
               consisting of labeled regions of hypertext that represent
               instances of these classes and relations. Given these inputs, the
               system learns to extract information from other pages and
               hyperlinks on the Web. This article describes our general
               approach, several machine learning algorithms for this task, and
               promising initial results with a prototype system that has
               created a knowledge base describing university people, courses,
               and research projects.},
}
@article{Craven01,
  author    = {Craven, Mark and Slattery, Se{\'{a}}n},
  title     = {Relational learning with statistical predicate invention: Better
               models for hypertext},
  journal   = {Machine Learning},
  volume    = {43},
  number    = {1/2},
  pages     = {97--119},
  year      = {2001},
  url       = {http://www.wkap.nl/article.pdf?321079},
  abstract  = {We present a new approach to learning hypertext classifiers that
               combines a statistical text-learning method with a relational
               rule learner. This approach is well suited to learning in
               hypertext domains because its statistical component allows it to
               characterize text in terms of word frequencies, whereas its
               relational component is able to describe how neighboring
               documents are related to each other by hyperlinks that connect
               them. We evaluate our approach by applying it to tasks that
               involve learning definitions for (i) classes of pages, (ii)
               particular relations that exist between pairs of pages, and (iii)
               locating a particular class of information in the internal
               structure of pages. Our experiments demonstrate that this new
               approach is able to learn more accurate classifiers than either
               of its constituent methods alone.},
}
@article{Creecy92,
  author    = {Robert M. Creecy and Brij M. Masand and Stephen J. Smith and
               David L. Waltz},
  title     = {Trading {MIPS} and memory for knowledge engineering: classifying
               census returns on the {C}onnection {M}achine},
  journal   = {Communications of the {ACM}},
  volume    = {35},
  number    = {8},
  pages     = {48--63},
  year      = {1992},
  url       = {http://www.acm.org/pubs/articles/journals/cacm/1992-35-8/p48-creecy/p48-creecy.pdf},
}
@inproceedings{Cristianini01,
  author    = {Nello Cristianini and John Shawe-Taylor and Huma Lodhi},
  title     = {Latent Semantic Kernels},
  booktitle = {Proceedings of ICML-01, 18th International Conference on Machine
               Learning},
  editor    = {Carla Brodley and Andrea Danyluk},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Williams College, {US}},
  year      = {2001},
  pages     = {66--73},
  url       = {},
  abstract  = {Kernel methods like Support Vector Machines have successfully
               been used for text categorization. A standard choice of kernel
               function has been the inner product between the vector-space
               representation of two documents, in analogy with classical
               information retrieval (IR) approaches. Latent Semantic Indexing
               (LSI) has been successfully used for IR purposes, as a technique
               for capturing semantic relations between terms and inserting them
               into the similarity measure between two documents. One of its
               main drawbacks, in IR, is its computational cost. In this paper
               we describe how the LSI approach can be implemented in a
               kernel-defined feature space. We provide experimental results
               demonstrating that the approach can significantly improve
               performance, and that it does not impair it.},
}
@incollection{Cristianini01a,
  author    = {Huma Lodhi and John Shawe-Taylor and Nello Cristianini and
               Christopher J. Watkins},
  title     = {Discrete Kernels for Text Categorisation},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {Todd K. Leen and Thomas G. Dietterich and Volker Tresp},
  volume    = {13},
  publisher = {{MIT} Press, Cambridge, {US}},
  year      = {2001},
  pages     = {563--569},
  url       = {http://www.support-vector.net/papers/LodhiShawe-TaylorCristianiniWatkins_ps.ps},
  abstract  = {},
}
@article{Cristianini02,
  author    = {Nello Cristianini and John Shawe-Taylor and Huma Lodhi},
  title     = {Latent Semantic Kernels},
  journal   = {Journal of Intelligent Information Systems},
  volume    = {18},
  number    = {2/3},
  pages     = {127--152},
  year      = {2002},
  note      = {Special Issue on Automated Text Categorization},
  url       = {http://www.wkap.nl/article.pdf?391243},
  abstract  = {Kernel methods like Support Vector Machines have successfully
               been used for text categorization. A standard choice of kernel
               function has been the inner product between the vector-space
               representation of two documents, in analogy with classical
               information retrieval (IR) approaches. Latent Semantic Indexing
               (LSI) has been successfully used for IR purposes as a technique
               for capturing semantic relations between terms and inserting them
               into the similarity measure between two documents. One of its
               main drawbacks, in IR, is its computational cost. In this paper
               we describe how the LSI approach can be implemented in a
               kernel-defined feature space. We provide experimental results
               demonstrating that the approach can significantly improve
               performance, and that it does not impair it.},
}
@inproceedings{Dalessio98,
  author    = {Stephen D'Alessio and Keitha Murray and Robert Schiaffino and
               Aaron Kershenbaum},
  title     = {Category Levels in Hierarchical Text Categorization},
  booktitle = {Proceedings of EMNLP-98, 3rd Conference on Empirical Methods in
               Natural Language Processing},
  editor    = {},
  publisher = {Association for Computational Linguistics, Morristown, {US}},
  address   = {Granada, {ES}},
  year      = {1998},
  pages     = {},
  url       = {http://www.iona.edu/cs/FacultyPublications/emnlpf.pdf},
  abstract  = {We consider the problem of assigning level numbers (weights) to
               hierarchically organized categories during the process of text
               categorization. These levels control the ability of the
               categories to attract documents during the categorization
               process. The levels are adjusted in order to obtain a balance
               between recall and precision for each category. If a category's
               recall exceeds its precision, the category is too strong and its
               level is reduced. Conversely, a category's level is increased to
               strengthen it if its precision exceeds its recall. The
               categorization algorithm used is a supervised learning procedure
               that uses a linear classifier based on the category levels. We
               are given a set of categories, organized hierarchically. We are
               also given a training corpus of documents already placed in one
               or more categories. From these, we extract vocabulary, words that
               appear with high frequency within a given category,
               characterizing each subject area. Each node's vocabulary is
               filtered and its words assigned weights with respect to the
               specific category. Then, test documents are scanned and
               categories ranked based on the presence of vocabulary terms.
               Documents are assigned to categories based on these rankings. We
               demonstrate that precision and recall can be significantly
               improved by solving the categorization problem taking hierarchy
               into account. Specifically, we show that by adjusting the
               category levels in a principled way, that precision can be
               significantly improved, from 84\% to 91\%, on the much-studied
               Reuters-21578 corpus organized in a three-level hierarchy of
               categories.},
}
@inproceedings{Dalessio00,
  author    = {Stephen D'Alessio and Keitha Murray and Robert Schiaffino and
               Aaron Kershenbaum},
  title     = {The effect of using Hierarchical classifiers in Text
               Categorization},
  booktitle = {Proceedings of RIAO-00, 6th International Conference ``Recherche
               d'Information Assist{\'e}e par Ordinateur''},
  editor    = {},
  address   = {Paris, {FR}},
  year      = {2000},
  pages     = {302--313},
  url       = {http://www.iona.edu/cs/FacultyPublications/riao2000New.pdf},
  abstract  = {Given a set of categories, with or without a preexisting
               hierarchy among them, we consider the problem of assigning
               documents to one or more of these categories from the point of
               view of a hierarchy with more or less depth. We can choose to
               make use of none, part or all of the hierarchical structure to
               improve the categorization effectiveness and efficiency. It is
               possible to create additional hierarchy among the categories. We
               describe a procedure for generating a hierarchy of classifiers
               that model the hierarchy structure. We report on computational
               experience using this procedure. We show that judicious use of a
               hierarchy can significantly improve both the speed and
               effectiveness of the categorization process. Using the
               Reuters-21578 corpus, we obtain an improvement in running time of
               over a factor of three and a 5\% improvement in F-measure.},
}
@article{Dasigi01,
  author    = {Dasigi, Venu and Mann, Reinhold C. and Protopopescu, Vladimir A.},
  title     = {Information fusion for text classification: an experimental
               comparison},
  journal   = {Pattern Recognition},
  volume    = {34},
  number    = {12},
  pages     = {2413--2425},
  year      = {2001},
  url       = {},
  abstract  = {This article reports on our experiments and results on the
               effectiveness of different feature sets and information fusion
               from some combinations of them in classifying free text documents
               into a given number of categories. We use different feature sets
               and integrate neural network learning into the method. The
               feature sets are based on the ``latent semantics'' of a reference
               library --- a collection of documents adequately representing the
               desired concepts. We found that a larger reference library is not
               necessarily better. Information fusion almost always gives better
               results than the individual constituent feature sets, with
               certain combinations doing better than the others.},
}
@inproceedings{Debole03,
  author    = {Franca Debole and Fabrizio Sebastiani},
  title     = {Supervised term weighting for automated text categorization},
  booktitle = {Proceedings of SAC-03, 18th ACM Symposium on Applied Computing},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Melbourne, {US}},
  year      = {2003},
  pages     = {784--788},
  url       = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/SAC03b.pdf},
  abstract  = {The construction of a text classifier usually involves (i) a
               phase of \emph{term selection}, in which the most relevant terms
               for the classification task are identified, (ii) a phase of
               \emph{term weighting}, in which document weights for the selected
               terms are computed, and (iii) a phase of \emph{classifier
               learning}, in which a classifier is generated from the weighted
               representations of the training documents. This process involves
               an activity of \emph{supervised learning}, in which information on
               the membership of training documents in categories is used.
               Traditionally, supervised learning enters only phases (i) and
               (iii). In this paper we propose instead that learning from
               training data should also affect phase (ii), i.e.\ that
               information on the membership of training documents to categories
               be used to determine term weights. We call this idea
               \emph{supervised term weighting} (STW). As an example, we propose
               a number of ``supervised variants'' of $tfidf$ weighting,
               obtained by replacing the $idf$ function with the function that
               has been used in phase (i) for term selection. We present
               experimental results obtained on the standard
               \textsf{Reuters-21578} benchmark with one classifier learning
               method (support vector machines), three term selection functions
               (information gain, chi-square, and gain ratio), and both local
               and global term selection and weighting.},
}
@inproceedings{deBuenaga97,
  author    = {De Buenaga Rodr{\'{\i}}guez, Manuel and G{\'o}mez-Hidalgo,
               Jos{\'e} Mar{\'{\i}}a and D{\'{\i}}az-Agudo, Bel{\'e}n},
  title     = {Using {WordNet} to Complement Training Information in Text
               Categorization},
  booktitle = {Proceedings of RANLP-97, 2nd International Conference on Recent
               Advances in Natural Language Processing},
  editor    = {Ruslan Mitkov and Nicolas Nicolov and Nikolai Nikolov},
  publisher = {},
  address   = {Tzigov Chark, {BG}},
  year      = {1997},
  pages     = {},
  url       = {http://xxx.unizar.es/ps/cmp-lg/9709007},
  abstract  = {Automatic Text Categorization (TC) is a complex and useful task
               for many natural language applications, and is usually performed
               through the use of a set of manually classified documents, a
               training collection. We suggest the utilization of additional
               resources like lexical databases to increase the amount of
               information that TC systems make use of, and thus, to improve
               their performance. Our approach integrates WordNet information
               with two training approaches through the Vector Space Model. The
               training approaches we test are the Rocchio (relevance feedback)
               and the Widrow-Hoff (machine learning) algorithms. Results
               obtained from evaluation show that the integration of WordNet
               clearly outperforms training approaches, and that an integrated
               technique can effectively address the classification of low
               frequency categories.},
}
@inproceedings{deLima98,
  author    = {De Lima, Luciano R. and Laender, Alberto H. and Ribeiro-Neto,
               Berthier A.},
  title     = {A hierarchical approach to the automatic categorization of
               medical documents},
  booktitle = {Proceedings of CIKM-98, 7th ACM International Conference on
               Information and Knowledge Management},
  editor    = {Georges Gardarin and James C. French and Niki Pissinou and Kia
               Makki and Luc Bouganim},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Bethesda, {US}},
  year      = {1998},
  pages     = {132--139},
  url       = {http://www.acm.org/pubs/articles/proceedings/cikm/288627/p132-de_lima/p132-de_lima.pdf},
  abstract  = {},
}
@article{deVel01,
  author    = {De Vel, Olivier Y. and Anderson, Alison and Corney, Malcolm and
               Mohay, George M.},
  title     = {Mining Email Content for Author Identification Forensics},
  journal   = {{SIGMOD} Record},
  volume    = {30},
  number    = {4},
  pages     = {55--64},
  year      = {2001},
  url       = {},
  abstract  = {We describe an investigation into e-mail content mining for
               author identification, or authorship attribution, for the purpose
               of forensic investigation. We focus our discussion on the ability
               to discriminate between authors for the case of both aggregated
               e-mail topics as well as across different email topics. An
               extended set of e-mail document features including structural
               characteristics and linguistic patterns were derived and,
               together with a Support Vector Machine learning algorithm, were
               used for mining the e-mail content. Experiments using a number of
               e-mail documents generated by different authors on a set of
               topics gave promising results for both aggregated and multi-topic
               author categorisation.},
}
@inproceedings{Diaz98,
  author    = {D{\'{\i}}az Esteban, Alberto and De Buenaga Rodr{\'{\i}}guez,
               Manuel and Ure{\~n}a L{\'o}pez, L. Alfonso and Garc{\'{\i}}a
               Vega, Manuel},
  title     = {Integrating Linguistic Resources in an Uniform Way for Text
               Classification Tasks},
  booktitle = {Proceedings of LREC-98, 1st International Conference on Language
               Resources and Evaluation},
  editor    = {Antonio Rubio and Natividad Gallardo and Rosa Castro and Antonio
               Tejada},
  publisher = {},
  address   = {Granada, {ES}},
  year      = {1998},
  pages     = {1197--1204},
  url       = {http://www.esi.uem.es/laboratorios/sinai/postscripts/lrec98.ps},
  abstract  = {Applications based on automatic text classification tasks, like
               text categorization (TC), word sense disambiguation (WSD), text
               filtering or routing, monolingual or multilingual information
               retrieval, and text summarization could obtain serious
               improvements by integrating linguistic resources in the current
               methods. We present an approach using the Vector Space Model to
               integrate two different kind of resources: a lexical database and
               training collections, in text content analysis tasks. The
               training approaches we test are the Rocchio (relevance feedback)
               and the Widrow-Hoff (machine learning) algorithms and WordNet as
               the lexical database. We have developed experimental systems for
               TC and WSD. Results obtained from evaluation show that the
               integration of WordNet can outperform approaches based only on
               training.},
}
@article{Diederich03,
  author    = {Diederich, Joachim and Kindermann, J{\"{o}}rg and Leopold, Edda
               and Paa{\ss}, Gerhard},
  title     = {Authorship Attribution with Support Vector Machines},
  journal   = {Applied Intelligence},
  volume    = {19},
  number    = {1/2},
  pages     = {109--123},
  year      = {2003},
  url       = {http://ipsapp007.kluweronline.com/content/getfile/4504/36/6/abstract.htm},
  abstract  = {In this paper we explore the use of text-mining methods for the
               identification of the author of a text. We apply the support
               vector machine (SVM) to this problem, as it is able to cope with
               half a million of inputs it requires no feature selection and can
               process the frequency vector of all words of a text. We performed
               a number of experiments with texts from a German newspaper. With
               nearly perfect reliability the SVM was able to reject other
               authors and detected the target author in 60--80\% of the cases.
               In a second experiment, we ignored nouns, verbs and adjectives
               and replaced them by grammatical tags and bigrams. This resulted
               in slightly reduced performance. Author detection with SVMs on
               full word forms was remarkably robust even if the author wrote
               about different topics.},
}
@inproceedings{Dorre99,
  author    = {Jochen D{\"o}rre and Peter Gerstl and Roland Seiffert},
  title     = {Text mining: finding nuggets in mountains of textual data},
  booktitle = {Proceedings of KDD-99, 5th ACM International Conference on
               Knowledge Discovery and Data Mining},
  editor    = {},
  publisher = {{ACM} Press, New York, {US}},
  address   = {San Diego, {US}},
  year      = {1999},
  pages     = {398--401},
  url       = {http://www.acm.org/pubs/articles/proceedings/ai/312129/p398-dorre/p398-dorre.pdf},
  abstract  = {Text mining applies the same analytical functions of data mining
               to the domain of textual information, relying on sophisticated
               text analysis techniques that distill information from free-text
               documents. IBM's Intelligent Miner for Text provides the
               necessary tools to unlock the business information that is
               ``trapped'' in email, insurance claims, news feeds, or other
               document repositories. It has been successfully applied in
               analyzing patent portfolios, customer complaint letters, and even
               competitors' Web pages. After defining our notion of ``text
               mining'', we focus on the differences between text and data
               mining and describe in some more detail the unique technologies
               that are key to successful text mining.},
}
@inproceedings{Dagan96,
  author    = {Dagan, Ido and Feldman, Ronen and Hirsh, Haym},
  title     = {Keyword-based browsing and analysis of large document sets},
  booktitle = {Proceedings of SDAIR-96, 5th Annual Symposium on Document
               Analysis and Information Retrieval},
  editor    = {},
  publisher = {},
  address   = {Las Vegas, {US}},
  year      = {1996},
  pages     = {191--207},
  url       = {},
  abstract  = {Knowledge discovery in databases (KDD) focuses on the
               computerized exploration of large amounts of data and on the
               discovery of interesting patterns within them. While most work on
               KDD has been concerned with structured databases, there has been
               little work on handling the huge amount of information that is
               available only in unstructured textual form. The paper describes
               the KDT system for knowledge discovery in texts. It is built on
               top of a text-categorization paradigm where text articles are
               annotated with keywords organized in a hierarchical structure.
               Knowledge discovery is performed by analyzing the co-occurrence
               frequencies of keywords from this hierarchy in the various
               documents. The authors show how this term-frequency approach
               supports a range of KDD operations, providing a general framework
               for knowledge discovery and exploration in collections of
               unstructured text.},
}
@inproceedings{Dagan97,
  author    = {Ido Dagan and Yael Karov and Dan Roth},
  title     = {Mistake-driven learning in text categorization},
  booktitle = {Proceedings of EMNLP-97, 2nd Conference on Empirical Methods in
               Natural Language Processing},
  editor    = {Claire Cardie and Ralph Weischedel},
  publisher = {Association for Computational Linguistics, Morristown, {US}},
  address   = {Providence, {US}},
  year      = {1997},
  pages     = {55--63},
  url       = {http://l2r.cs.uiuc.edu/~danr/Papers/categ.ps.gz},
  abstract  = {Learning problems in the text processing domain often map the
               text to a space whose dimensions are the measured features of the
               text, e.g., its words. Three characteristic properties of this
               domain are (a) very high dimensionality, (b) both the learned
               concepts and the instances reside very sparsely in the feature
               space, and (c) a high variation in the number of active features
               in an instance. In this work we study three mistake-driven
               learning algorithms for a typical task of this nature - text
               categorization. We argue that these algorithms which categorize
               documents by learning a linear separator in the feature space
               have a few properties that make them ideal for this domain. We
               then show that a quantum leap in performance is achieved when we
               further modify the algorithms to better address some of the
               specific characteristics of the domain. In particular, we
               demonstrate (1) how variation in document length can be tolerated
               by either normalizing feature weights or by using negative
               weights, (2) the positive effect of applying a threshold range in
               training, (3) alternatives in considering feature frequency, and
               (4) the benefits of discarding features while training. Overall,
               we present an algorithm, a variation of Littlestone's Winnow,
               which performs significantly better than any other algorithm
               tested on this task using a similar feature set.},
}
@article{Damashek95,
  author   = {Marc Damashek},
  title    = {Gauging Similarity with {N}-Grams: Language-Independent
              Categorization of Text},
  journal  = {Science},
  volume   = {267},
  number   = {5199},
  year     = {1995},
  pages    = {843--848},
  url      = {},
  abstract = {A language-independent means of gauging topical similarity in
              unrestricted text is described. The method combines information
              derived from n-grams (consecutive sequences of n characters) with
              a simple vector-space technique that makes sorting,
              categorization, and retrieval feasible in a large multilingual
              collection of documents. No prior information about document
              content or language is required. Context, as it applies to
              document similarity, can be accommodated by a well-defined
              procedure. When an existing document is used as an exemplar, the
              completeness and accuracy with which topically related documents
              are retrieved is comparable to that of the best existing systems.
              The results of a formal evaluation are discussed, and examples
              are given using documents in English and Japanese.},
}
@inproceedings{Denoyer01,
  author    = {Ludovic Denoyer and Hugo Zaragoza and Patrick Gallinari},
  title     = {{HMM}-based Passage Models for Document Classification and
               Ranking},
  booktitle = {Proceedings of ECIR-01, 23rd European Colloquium on Information
               Retrieval Research},
  editor    = {},
  publisher = {},
  address   = {Darmstadt, {DE}},
  year      = {2001},
  pages     = {126--135},
  url       = {http://www-connex.lip6.fr/~denoyer/publications/denoyer-final-ecir01.ps},
  abstract  = {We present an application of Hidden Markov Models to supervised
               document classification and ranking. We consider a family of
               models that take into account the fact that relevant documents
               may contain irrelevant passages; the originality of the model is
               that it does not explicitly segment documents but rather
               considers all possible segmentations in its final score. This
               model generalizes the multinomial Naive Bayes and it is derived
               from a more general model for different access tasks. The model
               is evaluated on the REUTERS test collection and compared to the
               multinomial Naive Bayes model. It is shown to be more robust with
               respect to the training set size and to improve the performance
               both for ranking and classification, specially for classes with
               few training examples.},
}
@inproceedings{Diao00,
  author    = {Yanlei Diao and Hongjun Lu and Dekai Wu},
  title     = {A comparative study of classification-based personal e-mail
               filtering},
  booktitle = {Proceedings of PAKDD-00, 4th Pacific-Asia Conference on Knowledge
               Discovery and Data Mining},
  editor    = {Takao Terano and Huan Liu and Arbee L. P. Chen},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Kyoto, {JP}},
  year      = {2000},
  pages     = {408--419},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 1805},
  url       = {http://www.cs.berkeley.edu/~diaoyl/publications/pakdd00.ps},
  abstract  = {This paper addresses personal E-mail filtering by casting it in
               the framework of text classification. Modeled as semi-structured
               documents, E-mail messages consist of a set of fields with
               predefined semantics and a number of variable length free-text
               fields. While most work on classification either concentrates on
               structured data or free text, the work in this paper deals with
               both of them. To perform classification, a naive Bayesian
               classifier was designed and implemented, and a decision tree
               based classifier was implemented. The design considerations and
               implementation issues are discussed. Using a relatively large
               amount of real personal E-mail data, a comprehensive comparative
               study was conducted using the two classifiers. The importance of
               different features is reported. Results of other issues related
               to building an effective personal E-mail classifier are presented
               and discussed. It is shown that both classifiers can perform
               filtering with reasonable accuracy. While the decision tree based
               classifier outperforms the Bayesian classifier when features and
               training size are selected optimally for both, a carefully
               designed naive Bayesian classifier is more robust.},
}
@article{Doyle65,
  author   = {Lauren B. Doyle},
  title    = {Is automatic classification a reasonable application of
              statistical analysis of text?},
  journal  = {Journal of the {ACM}},
  volume   = {12},
  number   = {4},
  year     = {1965},
  pages    = {473--489},
  url      = {http://www.acm.org/pubs/articles/journals/jacm/1965-12-4/p473-doyle/p473-doyle.pdf},
  abstract = {The statistical approach to the analysis of document collections
              and retrieval therefrom has proceeded along two main lines,
              associative machine searching and automatic classification. The
              former approach has been favored because of the tendency of
              people in the computer field to strive for new methods of dealing
              with the literature -- methods which do not resemble those of
              traditional libraries. But automatic classification study also
              has been thriving; some of the reasons for this are discussed.
              The crucial question of the quality of automatic classification
              is treated at considerable length, and empirical data are
              introduced to support the hypothesis that classification quality
              improves as more information about each document is used for
              input to the classification program. Six nonjudgmental criteria
              are used in testing the hypothesis for 100 keyword lists (each
              list representing a document) for a series of computer runs in
              which the number of words per document is increased progressively
              from 12 to 36. Four of the six criteria indicate the hypothesis
              holds, and two point to no effect. Previous work of this kind has
              been confined to the range of one through eight words per
              document. Finally, the future of automatic classification and
              some of the practical problems to be faced are outlined.},
}
@article{Drucker99,
  author   = {Harris Drucker and Donghui Wu and Vladimir Vapnik},
  title    = {Support vector machines for spam categorization},
  journal  = {{IEEE} Transactions on Neural Networks},
  volume   = {10},
  number   = {5},
  year     = {1999},
  pages    = {1048--1054},
  url      = {http://www.monmouth.edu/~drucker/SVM_spam_article_compete.PDF},
  abstract = {We study the use of Support Vector Machines (SVMs) in classifying
              email as spam or nonspam by comparing it to three other
              classification algorithms: Ripper, Rocchio, and boosting decision
              trees. These four algorithms were tested on two different data
              sets: one data set where the number of features were constrained
              to the 1000 best features and another data set where the
              dimensionality was over 7000. SVMs performed best when using
              binary features. For both data sets, boosting trees and SVMs had
              acceptable test performance in terms of accuracy and speed.
              However, SVMs had significantly less training time.},
}
@inproceedings{Dumais98,
  author    = {Susan T. Dumais and John Platt and David Heckerman and Mehran
               Sahami},
  title     = {Inductive learning algorithms and representations for text
               categorization},
  booktitle = {Proceedings of CIKM-98, 7th ACM International Conference on
               Information and Knowledge Management},
  editor    = {Georges Gardarin and James C. French and Niki Pissinou and Kia
               Makki and Luc Bouganim},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Bethesda, {US}},
  year      = {1998},
  pages     = {148--155},
  url       = {http://robotics.stanford.edu/users/sahami/papers-dir/cikm98.pdf},
  abstract  = {Text categorization -- the assignment of natural language texts
               to one or more predefined categories based on their content -- is
               an important component in many information organization and
               management tasks. We compare the effectiveness of five different
               automatic learning algorithms for text categorization in terms of
               learning speed, real-time classification speed, and
               classification accuracy. We also examine training set size, and
               alternative document representations. Very accurate text
               classifiers can be learned automatically from training examples.
               Linear Support Vector Machines (SVMs) are particularly promising
               because they are very accurate, quick to train, and quick to
               evaluate.},
}
@inproceedings{Dumais00,
  author    = {Susan T. Dumais and Hao Chen},
  title     = {Hierarchical classification of {W}eb content},
  booktitle = {Proceedings of SIGIR-00, 23rd ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Athens, {GR}},
  year      = {2000},
  pages     = {256--263},
  url       = {http://research.microsoft.com/~sdumais/sigir00.pdf},
  abstract  = {This paper explores the use of hierarchical structure for
               classifying a large, heterogeneous collection of web content. The
               hierarchical structure is initially used to train different
               second-level classifiers. In the hierarchical case, a model is
               learned to distinguish a second-level category from other
               categories within the same top level. In the flat
               non-hierarchical case, a model distinguishes a second-level
               category from all other second-level categories. Scoring rules
               can further take advantage of the hierarchy by considering only
               second-level categories that exceed a threshold at the top level.
               We use support vector machine (SVM) classifiers, which have been
               shown to be efficient and effective for classification, but not
               previously explored in the context of hierarchical
               classification. We found small advantages in accuracy for
               hierarchical models over flat models. For the hierarchical
               approach, we found the same accuracy using a sequential Boolean
               decision rule and a multiplicative decision rule. Since the
               sequential approach is much more efficient, requiring only
               14\%-16\% of the comparisons used in the other approaches, we
               find it to be a good choice for classifying text into large
               hierarchical structures.},
}
@inproceedings{ElYaniv01,
  author    = {Ran El-Yaniv and Oren Souroujon},
  title     = {Iterative Double Clustering for Unsupervised and Semi-supervised
               Learning},
  booktitle = {Proceedings of ECML-01, 12th European Conference on Machine
               Learning},
  editor    = {De Raedt, Luc and Peter A. Flach},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Freiburg, {DE}},
  year      = {2001},
  pages     = {121--132},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 2167},
  url       = {http://link.springer.de/link/service/series/0558/papers/2167/21670121.pdf},
  abstract  = {This paper studies the Iterative Double Clustering (IDC)
               meta-clustering algorithm, a new extension of the recent Double
               Clustering (DC) method of Slonim and Tishby that exhibited
               impressive performance on text categorization tasks. Using
               synthetically generated data we empirically demonstrate that
               whenever the DC procedure is successful in recovering some of the
               structure hidden in the data, the extended IDC procedure can
               incrementally compute a dramatically better classification, with
               minor additional computational resources. We demonstrate that the
               IDC algorithm is especially advantageous when the data exhibits
               high attribute noise. Our simulation results also show the
               effectiveness of IDC in text categorization problems.
               Surprisingly, this unsupervised procedure can be competitive with
               a (supervised) SVM trained with a small training set. Finally, we
               propose a natural extension of IDC for (semi-supervised)
               transductive learning where we are given both labeled and
               unlabeled examples, and present preliminary empirical results
               showing the plausibility of the extended method in a
               semi-supervised setting.},
}
@inproceedings{Escudero00,
  author    = {Gerard Escudero and Llu{\'{\i}}s M{\`{a}}rquez and German Rigau},
  title     = {Boosting applied to word sense disambiguation},
  booktitle = {Proceedings of ECML-00, 11th European Conference on Machine
               Learning},
  editor    = {L{\'{o}}pez de M{\'{a}}ntaras, Ramon and Enric Plaza},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Barcelona, {ES}},
  year      = {2000},
  pages     = {129--141},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 1810},
  url       = {http://www.lsi.upc.es/~escudero/recerca/ecml00.pdf},
  abstract  = {In this paper Schapire and Singer's AdaBoost.MH boosting
               algorithm is applied to the Word Sense Disambiguation (WSD)
               problem. Initial experiments on a set of 15 selected polysemous
               words show that the boosting approach surpasses Naive Bayes and
               Exemplar-based approaches, which represent state-of-the-art
               accuracy on supervised WSD. In order to make boosting practical
               for a real learning domain of thousands of words, several ways of
               accelerating the algorithm by reducing the feature space are
               studied. The best variant, which we call LazyBoosting, is tested
               on the largest sense-tagged corpus available containing 192,800
               examples of the 191 most frequent and ambiguous English words.
               Again, boosting compares favourably to the other benchmark
               algorithms.},
}
@article{Fall03,
  author   = {C. J. Fall and A. T{\"o}rcsv{\'a}ri and K. Benzineb and G.
              Karetka},
  title    = {Automated Categorization in the International Patent
              Classification},
  journal  = {{SIGIR} Forum},
  volume   = {37},
  number   = {1},
  year     = {2003},
  pages    = {},
  url      = {http://www.acm.org/sigir/forum/S2003/CJF_Manuscript_sigir.pdf},
  abstract = {A new reference collection of patent documents for training and
              testing automated categorization systems is established and
              described in detail. This collection is tailored for automating
              the attribution of international patent classification codes to
              patent applications and is made publicly available for future
              research work. We report the results of applying a variety of
              machine learning algorithms to the automated categorization of
              English-language patent documents. This procedure involves a
              complex hierarchical taxonomy, within which we classify documents
              into 114 classes and 451 subclasses. Several measures of
              categorization success are described and evaluated. We
              investigate how best to resolve the training problems related to
              the attribution of multiple classification codes to each patent
              document.},
}
@inproceedings{Fangmeyer68,
  author    = {Hermann Fangmeyer and Gerhard Lustig},
  title     = {The {EURATOM} automatic indexing project},
  booktitle = {Proceedings of the IFIP Congress (Booklet J)},
  editor    = {},
  publisher = {},
  address   = {Edinburgh, {UK}},
  year      = {1968},
  pages     = {66--70},
  url       = {},
  abstract  = {},
}
@inproceedings{Fangmeyer70,
  author    = {Hermann Fangmeyer and Gerhard Lustig},
  title     = {Experiments with the {CETIS} automated indexing system},
  booktitle = {Proceedings of the Symposium on the Handling of Nuclear
               Information},
  editor    = {},
  publisher = {International Atomic Energy Agency},
  address   = {},
  year      = {1970},
  pages     = {557--567},
  url       = {},
  abstract  = {},
}
@inproceedings{Ferilli01,
  author    = {Stefano Ferilli and Nicola Fanizzi and Gianni Semeraro},
  title     = {Learning logic models for automated text categorization},
  booktitle = {Proceedings of AI*IA-01, 7th Congress of the Italian Association
               for Artificial Intelligence},
  editor    = {Floriana Esposito},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Bari, {IT}},
  year      = {2001},
  pages     = {81--86},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 2175},
  url       = {http://link.springer.de/link/service/series/0558/papers/2175/21750081.pdf},
  abstract  = {This work addresses a logical approach to text categorization
               inside a framework aimed at full automatic paper document
               processing. The logic representation of sentences required by the
               adopted learning algorithm is obtained by detecting structure in
               raw text through a parser. A preliminary experimentation proved
               that the logic approach is able to capture the semantics
               underlying some kind of sentences, even if the assessment of the
               efficiency of such a method, as well as a comparison with other
               related approaches, has still to be carried out.},
}
@article{Field75,
  author   = {B. J. Field},
  title    = {Towards automatic indexing: automatic assignment of
              controlled-language indexing and classification from free
              indexing},
  journal  = {Journal of Documentation},
  volume   = {31},
  number   = {4},
  year     = {1975},
  pages    = {246--265},
  url      = {},
  abstract = {},
}
@inproceedings{Finn02,
  author    = {Aidan Finn and Nicholas Kushmerick and Barry Smyth},
  title     = {Genre Classification and Domain Transfer for Information
               Filtering},
  booktitle = {Proceedings of ECIR-02, 24th European Colloquium on Information
               Retrieval Research},
  editor    = {Fabio Crestani and Mark Girolami and van Rijsbergen, Cornelis J.},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Glasgow, {UK}},
  year      = {2002},
  pages     = {353--362},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 2291},
  url       = {http://www.cs.ucd.ie/staff/nick/home/research/download/finn-ecir2002.ps.gz},
  abstract  = {The World Wide Web is a vast repository of information, but the
               sheer volume makes it difficult to identify useful documents. We
               identify document genre as an important factor in retrieving
               useful documents and focus on the novel document genre dimension
               of subjectivity. We investigate three approaches to automatically
               classifying documents by genre: traditional bag of words
               techniques, part-of-speech statistics, and hand-crafted shallow
               linguistic features. We are particularly interested in domain
               transfer: how well the learned classifiers generalize from the
               training corpus to a new document corpus. Our experiments
               demonstrate that the part-of-speech approach is better than
               traditional bag of words techniques, particularly in the domain
               transfer conditions.},
}
@inproceedings{Fisher03,
  author    = {Michelle Fisher and Richard Everson},
  title     = {When are links useful? Experiments in text classification},
  booktitle = {Proceedings of ECIR-03, 25th European Conference on Information
               Retrieval},
  editor    = {Fabrizio Sebastiani},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Pisa, {IT}},
  year      = {2003},
  pages     = {41--56},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 2633},
  url       = {http://link.springer.de/link/service/series/0558/papers/2633/26330041.pdf},
  abstract  = {Link analysis methods have become popular for information access
               tasks, especially information retrieval, where the link
               information in a document collection is used to complement the
               traditionally used content information. However, there has been
               little firm evidence to confirm the utility of link information.
               We show that link information can be useful when the document
               collection has a sufficiently high link density and links are of
               sufficiently high quality. We report experiments on text
               classification of the Cora and WebKB data sets using
               Probabilistic Latent Semantic Analysis and Probabilistic
               Hypertext Induced Topic Selection. Comparison with manually
               assigned classes shows that link information enhances
               classification in data with sufficiently high link density, but
               is detrimental to performance at low link densities or if the
               quality of the links is degraded. We introduce a new
               frequency-based method for selecting the most useful citations
               from a document collection for use in the model.},
}
@incollection{Forsyth99,
  author    = {Richard S. Forsyth},
  title     = {New directions in text categorization},
  booktitle = {Causal models and intelligent data management},
  editor    = {Alex Gammerman},
  publisher = {Springer Verlag},
  address   = {Heidelberg, {DE}},
  year      = {1999},
  pages     = {151--185},
  url       = {},
  abstract  = {},
}
@inproceedings{Frank00,
  author    = {Eibe Frank and Chang Chui and Ian H. Witten},
  title     = {Text Categorization Using Compression Models},
  booktitle = {Proceedings of DCC-00, IEEE Data Compression Conference},
  editor    = {Storer, James A. and Cohn, Martin},
  publisher = {{IEEE} Computer Society Press, Los Alamitos, {US}},
  address   = {Snowbird, {US}},
  year      = {2000},
  pages     = {200--209},
  url       = {http://dlib.computer.org/conferen/dcc/0592/pdf/05920555.pdf},
  abstract  = {Text categorization is the assignment of natural language texts
               to predefined categories based on their content. It has often
               been observed that compression seems to provide a very promising
               approach to categorization. The overall compression of an article
               with respect to different models can be compared to see which one
               it fits most closely. Such a scheme has several potential
               advantages because it does not require any pre-processing of the
               input text. We have performed extensive experiments on the use of
               PPM compression models for categorization using the standard
               Reuters-21578 dataset. We obtained some encouraging results on
               two-category situations, and the results on the general problem
               seem reasonably impressive---in one case outstanding. However, we
               find that PPM does not compete with the published state of the
               art in the use of machine learning for text categorization. It
               produces inferior results because it is insensitive to subtle
               differences between articles that belong to a category and those
               that do not. We do not believe our results are specific to PPM.
               If the occurrence of a single word determines whether an article
               belongs to a category or not (and it often does) any compression
               scheme will likely fail to classify the article correctly.
               Machine learning schemes fare better because they automatically
               eliminate irrelevant features and concentrate on the most
               discriminating ones.},
}
@inproceedings{Frasconi01,
  author    = {Paolo Frasconi and Giovanni Soda and Alessandro Vullo},
  title     = {Text Categorization for Multi-page Documents: A Hybrid Naive
               {Bayes HMM} Approach},
  booktitle = {Proceedings of JCDL-01, 1st ACM-IEEE Joint Conference on Digital
               Libraries},
  editor    = {},
  publisher = {{IEEE} Computer Society Press, Los Alamitos, {US}},
  address   = {Roanoke, {US}},
  year      = {2001},
  pages     = {11--20},
  url       = {http://www.dsi.unifi.it/~paolo/ps/jcdl01-hmm-text.pdf},
  abstract  = {Text categorization is typically formulated as a concept learning
               problem where each instance is a single isolated document. In
               this paper we are interested in a more general formulation where
               documents are organized as page sequences, as naturally occurring
               in digital libraries of scanned books and magazines. We describe
               a method for classifying pages of sequential OCR text documents
               into one of several assigned categories and suggest that taking
               into account contextual information provided by the whole page
               sequence can significantly improve classification accuracy. The
               proposed architecture relies on hidden Markov models whose
               emissions are bag-of-words according to a multinomial word event
               model, as in the generative portion of the Naive Bayes
               classifier. Our results on a collection of scanned journals from
               the Making of America project confirm the importance of using
               whole page sequences. Empirical evaluation indicates that the
               error rate (as obtained by running a plain Naive Bayes classifier
               on isolated page) can be roughly reduced by half if contextual
               information is incorporated.},
}
@article{Frasconi02,
  author   = {Paolo Frasconi and Giovanni Soda and Alessandro Vullo},
  title    = {Text Categorization for Multi-page Documents: A Hybrid Naive
              {Bayes HMM} Approach},
  journal  = {Journal of Intelligent Information Systems},
  volume   = {18},
  number   = {2/3},
  year     = {2002},
  pages    = {195--217},
  note     = {Special Issue on Automated Text Categorization},
  url      = {http://www.wkap.nl/article.pdf?391247},
  abstract = {In the traditional setting, text categorization is formulated as
              a concept learning problem where each instance is a single
              isolated document. However, this perspective is not appropriate
              in the case of many digital libraries that offer as contents
              scanned and optically read books or magazines. In this paper, we
              propose a more general formulation of text categorization,
              allowing documents to be organized as \textit{sequences} of
              pages. We introduce a novel hybrid system specifically designed
              for multi-page text documents. The architecture relies on hidden
              Markov models whose emissions are bag-of-words resulting from a
              multinomial word event model, as in the generative portion of the
              Naive Bayes classifier. The rationale behind our proposal is that
              taking into account contextual information provided by the whole
              page sequence can help disambiguation and improves single page
              classification accuracy. Our results on two datasets of scanned
              journals from the Making of America collection confirm the
              importance of using whole page sequences. The empirical
              evaluation indicates that the error rate (as obtained by running
              the Naive Bayes classifier on isolated pages) can be
              significantly reduced if contextual information is incorporated.},
}
@inproceedings{Frommholz01,
  author    = {Ingo Frommholz},
  title     = {Categorizing {W}eb Documents in Hierarchical Catalogues},
  booktitle = {Proceedings of ECIR-01, 23rd European Colloquium on Information
               Retrieval Research},
  editor    = {},
  publisher = {},
  address   = {Darmstadt, {DE}},
  year      = {2001},
  pages     = {},
  url       = {http://ls6-www.informatik.uni-dortmund.de/bib/fulltext/ir/Frommholz:01a.pdf},
  abstract  = {Automatic categorization of web documents (e.g. HTML documents)
               denotes the task of automatically finding relevant categories for
               a (new) document which is to be inserted into a web catalogue
               like Yahoo!. There exist many approaches for performing this
               difficult task. Here, special kinds of web catalogues, those
               whose category scheme is hierarchically ordered, are regarded. A
               method for using the knowledge about the hierarchy to gain better
               categorization results is discussed. This method can be applied
               in a post-processing step and therefore be combined with other
               known (non-hierarchical) categorization approaches.},
}
@inproceedings{Fuhr84,
  author    = {Fuhr, Norbert and Knorz, Gerhard},
  title     = {Retrieval test evaluation of a rule-based automated indexing
               {(AIR/PHYS)}},
  booktitle = {Proceedings of SIGIR-84, 7th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {van Rijsbergen, Cornelis J.},
  publisher = {Cambridge University Press},
  address   = {Cambridge, {UK}},
  year      = {1984},
  pages     = {391--408},
  url       = {},
  abstract  = {},
}
@inproceedings{Fuhr85,
  author    = {Fuhr, Norbert},
  title     = {A probabilistic model of dictionary-based automatic indexing},
  booktitle = {Proceedings of RIAO-85, 1st International Conference ``Recherche
               d'Information Assist{\'e}e par Ordinateur''},
  editor    = {},
  publisher = {},
  address   = {Grenoble, {FR}},
  year      = {1985},
  pages     = {207--216},
  url       = {},
  abstract  = {},
}
@inproceedings{Fuhr91a,
  author    = {Fuhr, Norbert and Hartmann, Stephan and Knorz, Gerhard and
               Lustig, Gerhard and Schwantner, Michael and Tzeras, Konstadinos},
  title     = {{AIR/X} -- a Rule-Based Multistage Indexing System for Large
               Subject Fields},
  booktitle = {Proceedings of RIAO-91, 3rd International Conference ``Recherche
               d'Information Assist{\'e}e par Ordinateur''},
  editor    = {Andr{\'e} Lichnerowicz},
  publisher = {Elsevier Science Publishers, Amsterdam, {NL}},
  address   = {Barcelona, {ES}},
  year      = {1991},
  pages     = {606--623},
  url       = {http://www.darmstadt.gmd.de/~tzeras/FullPapers/gz/Fuhr-etal-91.ps.gz},
  abstract  = {AIR/X is a rule-based system for indexing with terms
               (descriptors) from a prescribed vocabulary. For this task, an
               indexing dictionary with rules for mapping terms from the text
               onto descriptors is required, which can be derived automatically
               from a set of manually indexed documents. Based on the Darmstadt
               Indexing Approach, the indexing task is divided into a
               description step and a decision step. First, terms (single words
               or phrases) are identified in the document text. With
               term-descriptor rules from the dictionary, descriptor indications
               are formed. The set of all indications from a document leading to
               the same descriptor is called a relevance description. A
               probabilistic classification procedure computes indexing weights
               for each relevance description. Since the whole system is
               rule-based, it can be adapted to different subject fields by
               appropriate modifications of the rule bases. A major application
               of AIR/X is the AIR/PHYS system developed for a large physics
               database. This application is described in more detail along with
               experimental results.},
}
@inproceedings{Fuhr91b,
  author    = {Norbert Fuhr and Ulrich Pfeifer},
  title     = {Combining Model-Oriented and Description-Oriented Approaches for
               Probabilistic Indexing},
  booktitle = {Proceedings of SIGIR-91, 14th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Abraham Bookstein and Yves Chiaramella and Gerard Salton and
               Vijay V. Raghavan},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Chicago, {US}},
  year      = {1991},
  pages     = {46--56},
  note      = {An extended version appears as~\cite{Fuhr94}},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/122860/p46-fuhr/p46-fuhr.pdf},
  abstract  = {We distinguish model-oriented and description-oriented approaches
               in probabilistic information retrieval. The former refer to
               certain representations of documents and queries and use
               additional independence assumptions, whereas the latter map
               documents and queries onto feature vectors which form the input
               to certain classification procedures or regression methods.
               Description-oriented approaches are more flexible with respect to
               the underlying representations, but the definition of the feature
               vector is a heuristic step. In this paper, we combine a
               probabilistic model for the Darmstadt Indexing Approach with
               logistic regression. Here the probabilistic model forms a
               guideline for the definition of the feature vector. Experiments
               with the purely theoretical approach and with several heuristic
               variations show that heuristic assumptions may yield significant
               improvements.},
}
@article{Fuhr94,
  author   = {Norbert Fuhr and Ulrich Pfeifer},
  title    = {Probabilistic Information Retrieval as a Combination of
              Abstraction, Inductive Learning, and Probabilistic Assumptions},
  journal  = {{ACM} Transactions on Information Systems},
  volume   = {12},
  number   = {1},
  year     = {1994},
  pages    = {92--115},
  url      = {http://ls6-www.informatik.uni-dortmund.de/bib/fulltext/ir/Fuhr_Pfeifer:94.ps.gz},
  abstract = {We show that former approaches in probabilistic information
              retrieval are based on one or two of the three concepts
              abstraction, inductive learning and probabilistic assumptions,
              and we propose a new approach which combines all three concepts.
              This approach is illustrated for the case of indexing with a
              controlled vocabulary. For this purpose, we describe a new
              probabilistic model first, which is then combined with logistic
              regression, thus yielding a generalization of the original model.
              Experimental results for the pure theoretical model as well as
              for heuristic variants are given. Furthermore, linear and
              logistic regression are compared.},
}
@inproceedings{Furnkranz99,
  author    = {Johannes F{\"{u}}rnkranz},
  title     = {Exploiting Structural Information for Text Classification on the
               {WWW}},
  booktitle = {Proceedings of IDA-99, 3rd Symposium on Intelligent Data Analysis},
  editor    = {David J. Hand and Joost N. Kok and Michael R. Berthold},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Amsterdam, {NL}},
  year      = {1999},
  pages     = {487--497},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 1642},
  url       = {http://www.ai.univie.ac.at/~juffi/publications/ida-99.ps.gz},
  abstract  = {In this paper, we report on a set of experiments that explore the
               utility of making use of the structural information of WWW
               documents. Our working hypothesis is that it is often easier to
               classify a hypertext page using information provided on pages
               that point to it instead of using information that is provided on
               the page itself. We present experimental evidence that confirms
               this hypothesis on a set of Web pages that relate to computer
               science departments.},
}
@article{Furnkranz02,
  author   = {Johannes F{\"{u}}rnkranz},
  title    = {Hyperlink Ensembles: A Case Study in Hypertext Classification},
  journal  = {Information Fusion},
  volume   = {3},
  number   = {4},
  year     = {2002},
  pages    = {299--312},
  url      = {},
  abstract = {In this paper, we introduce hyperlink ensembles, a novel type of
              ensemble classifier for classifying hypertext documents. Instead
              of using the text on a page for deriving features that can be
              used for training a classifier, we suggest to use portions of
              texts from all pages that point to the target page. A hyperlink
              ensemble is formed by obtaining one prediction for each hyperlink
              that points to a page. These individual predictions for each
              hyperlink are subsequently combined to a final prediction for the
              class of the target page. We explore four different ways of
              combining the individual predictions and four different
              techniques for identifying relevant text portions. The utility of
              our approach is demonstrated on a set of Web-pages that relate to
              Computer Science Departments.},
}
@inProceedings{Galavotti00,
  author    = {Luigi Galavotti and Fabrizio Sebastiani and Maria Simi},
  title     = {Experiments on the use of feature selection and negative evidence
    in automated text categorization},
  booktitle = {Proceedings of ECDL-00, 4th European Conference on Research and
    Advanced Technology for Digital Libraries},
  editor    = {Jos{\'e} L. Borbinha and Thomas Baker},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Lisbon, {PT}},
  year      = {2000},
  pages     = {59--68},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
    number 1923},
  url       = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/ECDL00.pdf},
  abstract  = {We tackle two different problems of {\em text categorization}
    (TC), namely feature selection and classifier induction. {\em
    Feature selection} (FS) refers to the activity of selecting, from
    the set of $r$ distinct features (i.e.\ words) occurring in the
    collection, the subset of $r'\ll r$ features that are most useful
    for compactly representing the meaning of the documents. We
    propose a novel FS technique, based on a simplified variant of
    the $\chi^2$ statistics. {\em Classifier induction} refers
    instead to the problem of automatically building a text
    classifier by learning from a set of documents pre-classified
    under the categories of interest. We propose a novel variant,
    based on the exploitation of negative evidence, of the well-known
    $k$-NN method. We report the results of systematic
    experimentation of these two methods performed on the standard
    {\sc Reuters-21578} benchmark.},
}
@article{Gale93,
  author   = {William A. Gale and Kenneth W. Church and David Yarowsky},
  title    = {A method for disambiguating word senses in a large corpus},
  journal  = {Computers and the Humanities},
  volume   = {26},
  number   = {5},
  year     = {1993},
  pages    = {415--439},
  url      = {http://www.research.att.com/~kwc/published_1993_sense.ps},
  abstract = {},
}
@inProceedings{Gao03,
  author    = {Sheng Gao and Wen Wu and Chin-Hui Lee and Tat-Seng Chua},
  title     = {A maximal figure-of-merit learning approach to text
    categorization},
  booktitle = {Proceedings of SIGIR-03, 26th ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {Jamie Callan and Gordon Cormack and Charles Clarke and David
    Hawking and Alan Smeaton},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Toronto, {CA}},
  year      = {2003},
  pages     = {174--181},
  url       = {http://doi.acm.org/10.1145/860435.860469},
  abstract  = {A novel maximal figure-of-merit (MFoM) learning approach to text
    categorization is proposed. Different from the conventional
    techniques, the proposed MFoM method attempts to integrate any
    performance metric of interest (e.g. accuracy, recall, precision,
    or F1 measure) into the design of any classifier. The
    corresponding classifier parameters are learned by optimizing an
    overall objective function of interest. To solve this highly
    nonlinear optimization problem, we use a generalized
    probabilistic descent algorithm. The MFoM learning framework is
    evaluated on the Reuters-21578 task with LSI-based feature
    extraction and a binary tree classifier. Experimental results
    indicate that the MFoM classifier gives improved F1 and enhanced
    robustness over the conventional one. It also outperforms the
    popular SVM method in micro-averaging F1. Other extensions to
    design discriminative multiple-category MFoM classifiers for
    application scenarios with new performance metrics could be
    envisioned too.},
}
@inProceedings{Gaussier02,
  author    = {{\'{E}}ric Gaussier and Cyril Goutte and Kris Popat and Francine
    Chen},
  title     = {A hierarchical model for clustering and categorising documents},
  booktitle = {Proceedings of ECIR-02, 24th European Colloquium on Information
    Retrieval Research},
  editor    = {Fabio Crestani and Mark Girolami and Cornelis J. Van Rijsbergen},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Glasgow, {UK}},
  year      = {2002},
  pages     = {229--247},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
    number 2291},
  url       = {http://link.springer.de/link/service/series/0558/papers/2291/22910229.pdf},
  abstract  = {We propose a new hierarchical generative model for textual data,
    where words may be generated by topic specific distributions at
    any level in the hierarchy. This model is naturally well-suited
    to clustering documents in preset or automatically generated
    hierarchies, as well as categorising new documents in an existing
    hierarchy. Training algorithms are derived for both cases, and
    illustrated on real data by clustering news stories and
    categorising newsgroup messages. Finally, the generative model
    may be used to derive a Fisher kernel expressing similarity
    between documents.},
}
@article{Gentili01,
author = {G. L. Gentili and Mauro Marinilli and Alessandro Micarelli and
Filippo Sciarrone},
title = {Text categorization in an intelligent agent for filtering
information on the {W}eb},
journal = {International Journal of Pattern Recognition and Artificial
Intelligence},
pages = {527--549},
year = {2001},
number = {3},
volume = {15},
url = {http://www.worldscinet.com/journals/ijprai/15/preserved-docs/1503/S021800140100099X.pdf},
abstract = {This paper presents a text categorization system, capable of
analyzing HTML/text documents collected from the Web. The system
is a component of a more extensive intelligent agent for adaptive
information filtering on the Web. It is based on a hybrid
case-based architecture, where two multilayer perceptrons are
integrated into a case-based reasoner. An empirical evaluation of
the system was performed by means of a confidence interval
technique. The experimental results obtained are encouraging and
support the choice of a hybrid case-based approach to text
categorization.},
}
@inProceedings{Geutner93,
  author    = {Petra Geutner and Uli Bodenhausen and Alex Waibel},
  title     = {Flexibility Through Incremental Learning: Neural Networks for
    Text Categorization},
  booktitle = {Proceedings of WCNN-93, World Congress on Neural Networks},
  editor    = {},
  publisher = {},
  address   = {Portland, {US}},
  year      = {1993},
  pages     = {24--27},
  url       = {http://werner.ira.uka.de/papers/speech/1993/WCNN_93_petra_geutner.ps.gz},
  abstract  = {In this paper we show an adaptive incremental learning algorithm
    that learns interactively to classify text messages (here:
    emails) into categories without the need for lengthy batch
    training runs. The algorithm was evaluated on a large database of
    email messages that fall into five subjective categories. As
    control experiment best human categorization performance was
    established at 79.4\% for this task. The best of all
    connectionist architectures presented here achieves near human
    performance (79.1\%). This architecture acquires its language
    model and dictionary adaptively and hence avoids handcoding of
    either. The learning algorithm combines an adaptive phase which
    instantly updates dictionary and weights during interaction and a
    tuning phase which fine tunes for performance using previously
    seen data. Such systems can be deployed in various applications
    where instantaneous interactive learning is necessary such as
    on-line email or news categorization, text summarization and
    information filtering in general.},
}
@inProceedings{Ghani00,
  author    = {Rayid Ghani},
  title     = {Using error-correcting codes for text classification},
  booktitle = {Proceedings of ICML-00, 17th International Conference on Machine
    Learning},
  editor    = {Pat Langley},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Stanford, {US}},
  year      = {2000},
  pages     = {303--310},
  url       = {http://www.cs.cmu.edu/~rayid/mypapers/ecoc-icml.ps},
  abstract  = {This paper explores in detail the use of Error Correcting Output
    Coding (ECOC) for learning text classifiers. We show that the
    accuracy of a Naive Bayes Classifier over text classification
    tasks can be significantly improved by taking advantage of the
    error-correcting properties of the code. We also explore the use
    of different kinds of codes, namely Error-Correcting Codes,
    Random Codes, and Domain and Data-specific codes and give
    experimental results for each of them. The ECOC method scales
    well to large data sets with a large number of classes.
    Experiments on a real-world data set show a reduction in
    classification error by up to 66\% over the traditional Naive
    Bayes Classifier. We also compare our empirical results to
    semi-theoretical results and find that the two closely agree.},
}
@inProceedings{Ghani01,
  author    = {Rayid Ghani and Se{\'{a}}n Slattery and Yiming Yang},
  title     = {Hypertext Categorization using Hyperlink Patterns and Meta Data},
  booktitle = {Proceedings of ICML-01, 18th International Conference on Machine
    Learning},
  editor    = {Carla Brodley and Andrea Danyluk},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Williams College, {US}},
  year      = {2001},
  pages     = {178--185},
  url       = {http://www.cs.cmu.edu/~yiming/papers.yy/hypertext-icml01.ps.gz},
  abstract  = {Hypertext poses new text classification research challenges as
    hyperlinks, content of linked documents, and meta data about
    related web sites all provide richer sources of information for
    hypertext classification that are not available in traditional
    text classification. We investigate the use of such information
    for representing web sites, and the effectiveness of different
    classifiers (Naive Bayes, Nearest Neighbor, and {\sc Foil}) in
    exploiting those representations. We find that using words in web
    pages alone often yields suboptimal performance of classifiers,
    compared to exploiting additional sources of information beyond
    document content. On the other hand, we also observe that linked
    pages can be more harmful than helpful when the linked
    neighborhoods are highly ``noisy'' and that links have to be used
    in a careful manner. More importantly, our investigation suggests
    that meta data which is often available, or can be acquired using
    Information Extraction techniques, can be extremely useful for
    improving classification accuracy. Finally, the relative
    performance of the different classifiers being tested gives us
    insights into the strengths and limitations of our algorithms for
    hypertext classification.},
}
@inProceedings{Ghani01a,
author = {Rayid Ghani},
title = {Combining Labeled and Unlabeled data for Text Classification with
a Large Number of Categories},
booktitle = {Proceedings of the IEEE International Conference on Data Mining},
editor = {Nick Cercone and Tsau Young Lin and Xindong Wu},
address = {San Jose, {US}},
year = {2001},
pages = {597--598},
publisher = {{IEEE} Computer Society, Los Alamitos, {US}},
url = {http://www.cs.cmu.edu/~rayid/mypapers/icdm01.ps},
abstract = {We develop a framework to incorporate unlabeled data in the
Error-Correcting Output Coding (ECOC) setup by decomposing
multiclass problems into multiple binary problems and then use
Co-Training to learn the individual binary classification
problems. We show that our method is especially useful for
classification tasks involving a large number of categories where
Co-training doesn't perform very well by itself and when combined
with ECOC, outperforms several other algorithms that combine
labeled and unlabeled data for text classification in terms of
accuracy, precision-recall tradeoff, and efficiency.},
}
@inProceedings{Ghani02,
  author    = {Rayid Ghani},
  title     = {Combining Labeled and Unlabeled Data for MultiClass Text
    Categorization},
  booktitle = {Proceedings of ICML-02, 19th International Conference on Machine
    Learning},
  editor    = {},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Sydney, {AU}},
  year      = {2002},
  pages     = {},
  url       = {http://www.accenture.com/xdoc/en/services/technology/publications/Ghani-ICML02.pdf},
  abstract  = {Supervised learning techniques for text classification often
    require a large number of labeled examples to learn accurately.
    One way to reduce the amount of labeled data required is to
    develop algorithms that can learn effectively from a small number
    of labeled examples augmented with a large number of unlabeled
    examples. Current text learning techniques for combining labeled
    and unlabeled, such as EM and Co-Training, are mostly applicable
    for classification tasks with a small number of classes and do
    not scale up well for large multiclass problems. In this paper,
    we develop a framework to incorporate unlabeled data in the
    Error-Correcting Output Coding (ECOC) setup by first decomposing
    multiclass problems into multiple binary problems and then using
    Co-Training to learn the individual binary classification
    problems. We show that our method is especially useful for text
    classification tasks involving a large number of categories and
    outperforms other semi-supervised learning techniques such as EM
    and Co-Training. In addition to being highly accurate, this
    method utilizes the hamming distance from ECOC to provide
    high-precision results. We also present results with algorithms
    other than co-training in this framework and show that
    co-training is uniquely suited to work well within ECOC.},
}
@inProceedings{Giorgetti03,
  author    = {Daniela Giorgetti and Fabrizio Sebastiani},
  title     = {Multiclass Text Categorization for Automated Survey Coding},
  booktitle = {Proceedings of SAC-03, 18th ACM Symposium on Applied Computing},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Melbourne, {US}},
  year      = {2003},
  pages     = {798--802},
  url       = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/SAC03a.pdf},
  abstract  = {\emph{Survey coding} is the task of assigning a symbolic code
    from a predefined set of such codes to the answer given in
    response to an open-ended question in a questionnaire (aka
    \emph{survey}). We formulate the problem of automated survey
    coding as a \emph{text categorization} problem, i.e.\ as the
    problem of learning, by means of supervised machine learning
    techniques, a model of the association between answers and codes
    from a training set of pre-coded answers, and applying the
    resulting model to the classification of new answers. In this
    paper we experiment with two different learning techniques, one
    based on na\"{\i}ve Bayesian classification and the other one
    based on multiclass support vector machines, and test the
    resulting framework on a corpus of social surveys. The results we
    have obtained significantly outperform the results achieved by
    previous automated survey coding approaches.},
}
@article{Giorgetti03a,
  author   = {Daniela Giorgetti and Fabrizio Sebastiani},
  title    = {Automating Survey Coding by Multiclass Text Categorization
    Techniques},
  journal  = {Journal of the American Society for Information Science and
    Technology},
  volume   = {},
  number   = {},
  year     = {2003},
  pages    = {},
  note     = {Forthcoming},
  url      = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/JASIST03.pdf},
  abstract = {\emph{Survey coding} is the task of assigning a symbolic code
    from a predefined set of such codes to the answer given in
    response to an open-ended question in a questionnaire (aka
    \emph{survey}). This task is usually carried out in order to
    group respondents according to a predefined scheme based on their
    answers. Survey coding has several applications, especially in
    the social sciences, ranging from the simple classification of
    respondents to the extraction of statistics on political
    opinions, health and lifestyle habits, customer satisfaction,
    brand fidelity, and patient satisfaction. Survey coding is a
    difficult task, since the code that should be attributed to a
    respondent based on the answer she has given is a matter of
    subjective judgment, and thus requires expertise. It is thus
    unsurprising that this task has traditionally been performed
    manually, by trained coders. Some attempts have been made at
    automating this task, most of them based on detecting the
    similarity between the answer and textual descriptions of the
    meanings of the candidate codes. We take a radically new stand,
    and formulate the problem of automated survey coding as a
    \emph{text categorization} problem, i.e.\ as the problem of
    learning, by means of supervised machine learning techniques, a
    model of the association between answers and codes from a
    training set of pre-coded answers, and applying the resulting
    model to the classification of new answers. In this paper we
    experiment with two different learning techniques, one based on
    na\"{\i}ve Bayesian classification and the other one based on
    multiclass support vector machines, and test the resulting
    framework on a corpus of social surveys. The results we have
    obtained significantly outperform the results achieved by
    previous automated survey coding approaches.},
}
@inProceedings{Glover02,
  author    = {Eric J. Glover and Kostas Tsioutsiouliklis and Steve Lawrence and
    David M. Pennock and Gary W. Flake},
  title     = {Using {W}eb structure for classifying and describing {W}eb pages},
  booktitle = {Proceedings of WWW-02, International Conference on the World Wide
    Web},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Honolulu, {US}},
  year      = {2002},
  pages     = {562--569},
  url       = {http://www.cs.princeton.edu/~kt/www02.ps},
  abstract  = {The structure of the web is increasingly being used to improve
    organization, search, and analysis of information on the web. For
    example, Google uses the text in citing documents (documents that
    link to the target document) for search. We analyze the relative
    utility of document text, and the text in citing documents near
    the citation, for classification and description. Results show
    that the text in citing documents, when available, often has
    greater discriminative and descriptive power than the text in the
    target document itself. The combination of evidence from a
    document and citing documents can improve on either information
    source alone. Moreover, by ranking words and phrases in the
    citing documents according to expected entropy loss, we are able
    to accurately name clusters of web pages, even with very few
    positive examples. Our results confirm, quantify, and extend
    previous research using web structure in these areas, introducing
    new methods for classification and description of pages.},
}
@inProceedings{Goldberg95,
author = {Goldberg, Jeffrey L.},
title = {{CDM}: an approach to learning in text categorization},
booktitle = {Proceedings of ICTAI-95, 7th International Conference on Tools
with Artificial Intelligence},
publisher = {{IEEE} Computer Society Press, Los Alamitos, {US}},
editor = {},
address = {Herndon, {US}},
year = {1995},
pages = {258--265},
url = {},
note = {An extended version appears as~\cite{Goldberg96}},
abstract = {The category discrimination method (CDM) is a new learning
algorithm designed for text categorization. The motivation is
that there are statistical problems associated with natural
language text when it is applied as input to existing machine
learning algorithms (too much noise, too many features, skewed
distribution). The bases of the CDM are research results about
the way that humans learn categories and concepts vis-a-vis
contrasting concepts. The essential formula is cue validity
borrowed from cognitive psychology, and used to select from all
possible single word-based features the `best' predictors of a
given category. The hypothesis that CDM's performance exceeds two
non-domain specific algorithms, Bayesian classification and
decision tree learners, is empirically tested.},
}
@article{Goldberg96,
  author   = {Goldberg, Jeffrey L.},
  title    = {{CDM}: an approach to learning in text categorization},
  journal  = {International Journal on Artificial Intelligence Tools},
  volume   = {5},
  number   = {1/2},
  year     = {1996},
  pages    = {229--253},
  url      = {},
  abstract = {The Category Discrimination Method (CDM) is a new machine
    learning algorithm designed specifically for text categorization.
    The motivation is that there are statistical problems associated
    with natural language text when it is applied as input to
    existing machine learning algorithms (too much noise, too many
    features, skewed distribution). The bases of the CDM are research
    results about the way that humans learn categories and concepts
    vis-a-vis contrasting concepts. The essential formula is cue
    validity borrowed from cognitive psychology, and used to select
    from all possible single word based features, the best predictors
    of a given category. The hypothesis that CDM's performance will
    exceed two non domain specific algorithms, Bayesian
    classification and decision tree learners, is empirically tested.},
}
@inProceedings{Goodman90,
  author    = {Marc Goodman},
  title     = {{\sc Prism}: a case-based telex classifier},
  booktitle = {Proceedings of IAAI-90, 2nd Conference on Innovative Applications
    of Artificial Intelligence},
  editor    = {Alain Rappaport and Reid Smith},
  publisher = {{AAAI} Press, Menlo Park, {US}},
  address   = {},
  year      = {1990},
  pages     = {25--37},
  url       = {},
  abstract  = {},
}
@inProceedings{Goevert99,
  author    = {Norbert G{\"{o}}vert and Mounia Lalmas and Norbert Fuhr},
  title     = {A probabilistic description-oriented approach for categorising
    {W}eb documents},
  booktitle = {Proceedings of CIKM-99, 8th ACM International Conference on
    Information and Knowledge Management},
  editor    = {},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Kansas City, {US}},
  year      = {1999},
  pages     = {475--482},
  url       = {http://ls6-www.informatik.uni-dortmund.de/ir/publications/1999/Goevert_etal:99.html},
  abstract  = {The automatic categorisation of web documents is becoming crucial
    for organising the huge amount of information available in the
    Internet. We are facing a new challenge due to the fact that web
    documents have a rich structure and are highly heterogeneous. Two
    ways to respond to this challenge are (1) using a representation
    of the content of web documents that captures these two
    characteristics and (2) using more effective classifiers. Our
    categorisation approach is based on a probabilistic
    description-oriented representation of web documents, and a
    probabilistic interpretation of the k-nearest neighbour
    classifier. With the former, we provide an enhanced document
    representation that incorporates the structural and heterogeneous
    nature of web documents. With the latter, we provide a
    theoretical sound justification for the various parameters of the
    k-nearest neighbour classifier. Experimental results show that
    (1) using an enhanced representation of web documents is crucial
    for an effective categorisation of web documents, and (2) a
    theoretical interpretation of the k-nearest neighbour classifier
    gives us improvement over the standard k-nearest neighbour
    classifier.},
}
@inProceedings{Gomez02,
author = {G{\'o}mez-Hidalgo, Jos{\'e} M. and De Buenaga Rodr{\'{\i}}guez,
Jos{\'e} M. and Ure{\~n}a L{\'o}pez, Luis A. and Mart{\'{\i}}n
Valdivia, Maria T. and Garc{\'{\i}}a Vega, Manuel},
title = {Integrating Lexical Knowledge in Learning-Based Text
Categorization},
booktitle = {Proceedings of JADT-02, 6th International Conference on the
Statistical Analysis of Textual Data},
publisher = {},
editor = {},
address = {St-Malo, {FR}},
pages = {},
year = {2002},
url = {http://www.cavi.univ-paris3.fr/lexicometrica/jadt/jadt2002/PDF-2002/gomez_debuenaga_urena_martin_garcia.pdf},
abstract = {Automatic Text Categorization (ATC) is an important task in the
field of Information Access. The prevailing approach to ATC is
making use of a collection of prelabeled texts for the
induction of a document classifier through learning methods. With
the increasing availability of lexical resources in electronic
form (including Lexical Databases (LDBs), Machine Readable
Dictionaries, etc.), there is an interesting opportunity for the
integration of them in learning-based ATC. In this paper, we
present an approach to the integration of lexical knowledge
extracted from the LDB WordNet in learning-based ATC, based on
Stacked Generalization (SG). The method we suggest is based on
combining the lexical knowledge extracted from the LDB
interpreted as a classifier with a learning-based classifier,
through SG. We have performed experiments which results show that
the ideas we describe are promising and deserve further
investigation.},
}
@inProceedings{Gomez02a,
author = {G{\'o}mez-Hidalgo, Jos{\'e} M.},
title = {Evaluating Cost-Sensitive Unsolicited Bulk Email Categorization},
booktitle = {Proceedings of SAC-02, 17th ACM Symposium on Applied Computing},
publisher = {{ACM} Press, New York, {US}},
editor = {},
address = {Madrid, {ES}},
pages = {615--620},
year = {2002},
url = {http://doi.acm.org/10.1145/508791.508911},
abstract = {In the recent years, Unsolicited Bulk Email has became an
increasingly important problem, with a big economic impact. In
this paper, we discuss cost-sensitive Text Categorization methods
for UBE filtering. In concrete, we have evaluated a range of
Machine Learning methods for the task (C4.5, Naive Bayes, PART,
Support Vector Machines and Rocchio), made cost sensitive through
several methods (Threshold Optimization, Instance Weighting, and
Meta-Cost). We have used the Receiver Operating Characteristic
Convex Hull method for the evaluation, that best suits
classification problems in which target conditions are not known,
as it is the case. Our results do not show a dominant algorithm
nor method for making algorithms cost-sensitive, but are the best
reported on the test collection used, and approach real-world
hand-crafted classifiers accuracy.},
}
@article{Gray71,
  author   = {W. A. Gray and A. J. Harley},
  title    = {Computer-assisted indexing},
  journal  = {Information Storage and Retrieval},
  volume   = {7},
  number   = {4},
  year     = {1971},
  pages    = {167--174},
  url      = {},
  abstract = {},
}
@inProceedings{Guthrie94,
  author    = {Louise Guthrie and Elbert Walker and Joe A. Guthrie},
  title     = {Document classification by machine: theory and practice},
  booktitle = {Proceedings of COLING-94, 15th International Conference on
    Computational Linguistics},
  editor    = {},
  publisher = {},
  address   = {Kyoto, {JP}},
  year      = {1994},
  pages     = {1059--1063},
  url       = {},
  abstract  = {},
}
@inCollection{Guthrie99,
  author    = {Louise Guthrie and Joe A. Guthrie and James Leistensnider},
  title     = {Document classification and routing},
  booktitle = {Natural language information retrieval},
  editor    = {Tomek Strzalkowski},
  publisher = {Kluwer Academic Publishers},
  address   = {Dordrecht, {NL}},
  year      = {1999},
  pages     = {289--310},
  url       = {},
  abstract  = {},
}
% NOTE(review): the pages below (423--436) disagree with the URL file name
% (...20040437.pdf); in the other Springer LNCS URLs of this file the trailing
% digits are the start page, which would suggest 437 here. Verify before citing.
@inProceedings{Hadjarian01,
author = {Ali Hadjarian and Jerzy Bala and Peter Pachowicz},
title = {Text Categorization through Multistrategy Learning and
Visualization},
booktitle = {Proceedings of CICLING-01, 2nd International Conference on
Computational Linguistics and Intelligent Text Processing},
year = {2001},
editor = {Alexander Gelbukh},
publisher = {Springer Verlag, Heidelberg, {DE}},
address = {Mexico City, {MX}},
note = {Published in the ``Lecture Notes in Computer Science'' series,
number 2004},
pages = {423--436},
url = {http://link.springer.de/link/service/series/0558/papers/2004/20040437.pdf},
abstract = {This paper introduces a multistrategy learning approach to the
categorization of text documents. The approach benefits from two
existing, and in our view complimentary, sets of categorization
techniques: those based on Rocchio's algorithm and those
belonging to the rule learning class of machine learning
algorithms. Visualization is used for the presentation of the
output of learning},
}
@inProceedings{Hamill78,
  author    = {Hamill, Karen A. and Zamora, Antonio},
  title     = {An automatic document classification system using pattern
    recognition techniques},
  booktitle = {Proceedings of ASIS-78, 41st Annual Meeting of the American
    Society for Information Science},
  editor    = {Everett H. Brenner},
  publisher = {American Society for Information Science, Washington, {US}},
  address   = {New York, {US}},
  year      = {1978},
  pages     = {152--155},
  url       = {},
  abstract  = {},
}
@article{Hamill80,
author = {Hamill, Karen A. and Zamora, Antonio},
title = {The Use of Titles for Automatic Document Classification},
journal = {Journal of the American Society for Information Science},
year = {1980},
number = {6},
pages = {396--402},
volume = {31},
url = {},
abstract = {},
}
@inProceedings{Han01,
author = {Eui-Hong Han and George Karypis and Vipin Kumar},
title = {Text Categorization Using Weight-Adjusted $k$-Nearest Neighbor
Classification},
booktitle = {Proceedings of PAKDD-01, 5th Pacific-Asia Conference on
Knowledge Discovery and Data Mining},
editor = {David Cheung and Qing Li and Graham Williams},
year = {2001},
publisher = {Springer Verlag, Heidelberg, {DE}},
address = {Hong Kong, {CN}},
note = {Published in the ``Lecture Notes in Computer Science'' series,
number 2035},
pages = {53--65},
url = {http://link.springer.de/link/service/series/0558/papers/2035/20350053.pdf},
abstract = {Text categorization presents unique challenges due to the large
number of attributes present in the data set, large number of
training samples, attribute dependency, and multi-modality of
categories. Existing classification techniques have limited
applicability in the data sets of these natures. In this paper,
we present a Weight Adjusted k-Nearest Neighbor (WAKNN)
classification that learns feature weights based on a greedy hill
climbing technique. We also present two performance optimizations
of WAKNN that improve the computational performance by a few
orders of magnitude, but do not compromise on the classification
quality. We experimentally evaluated WAKNN on 52 document data
sets from a variety of domains and compared its performance
against several classification algorithms, such as C4.5, RIPPER,
Naive-Bayesian, PEBLS and VSM. Experimental results on these data
sets confirm that WAKNN consistently outperforms other existing
classification algorithms.},
}
@article{Hanauer96,
  author   = {David Hanauer},
  title    = {Integration of phonetic and graphic features in poetic text
    categorization judgements},
  journal  = {Poetics},
  volume   = {23},
  number   = {5},
  year     = {1996},
  pages    = {363--380},
  url      = {},
  abstract = {The experiments reported in this paper deal with the relationship
    between specific formal textual features, i.e. graphic and
    phonetic information, and the reader's literary educational
    background in the categorization of poetic texts. In two
    experiments, the research method of Information Integration
    Theory was employed in order to test two hypotheses relating to
    the radical conventionalist and traditional positions on the role
    of specific formal textual features in the categorization of
    poetic texts. Twenty subjects from expert or novice literary
    reading experience backgrounds were, in two experiments, required
    to rate two parallel sets of graphically and phonetically
    manipulated poems. The results reveal that subjects are sensitive
    to the manipulations of graphic and phonetic information and use
    the same additive information integration rule in making poetic
    text categorization judgements. The expert literary readers were
    found to assign significantly higher ratings to all versions of
    the manipulated poems than the novice readers.},
}
@inProceedings{Hayes88,
  author    = {Philip J. Hayes and Laura E. Knecht and Monica J. Cellio},
  title     = {A news story categorization system},
  booktitle = {Proceedings of ANLP-88, 2nd Conference on Applied Natural
    Language Processing},
  editor    = {},
  publisher = {Association for Computational Linguistics, Morristown, {US}},
  address   = {Austin, {US}},
  year      = {1988},
  pages     = {9--17},
  note      = {Reprinted in Karen Sparck Jones and Peter Willett (eds.),
    ``Readings in Information Retrieval'', Morgan Kaufmann, San
    Francisco, US, 1997, pp.\ 518--526.},
  url       = {},
  abstract  = {The article describes a pilot version of a commercial application
    of natural language processing techniques to the problem of
    categorizing new stories into broad topic categories. The system
    does not perform a complete semantic or syntactic analyses of the
    input stories. Its categorizations are dependent on fragmentary
    recognition using pattern-matching techniques. The fragments it
    looks for are determined by a set of knowledge-based rules. The
    accuracy of the system is only slightly lower than that of human
    categorizers.},
}
@inProceedings{Hayes90a,
author = {Philip J. Hayes and Peggy M. Andersen and Irene B. Nirenburg and
Linda M. Schmandt},
title = {{\sc Tcs}: a shell for content-based text categorization},
booktitle = {Proceedings of CAIA-90, 6th IEEE Conference on Artificial
Intelligence Applications},
publisher = {{IEEE} Computer Society Press, Los Alamitos, {US}},
editor = {},
year = {1990},
address = {Santa Barbara, {US}},
pages = {320--326},
url = {},
abstract = {The kind of application that the text categorization shell, TCS,
can produce is characterized. Many of its applications have great
commercial value. The design goals for TCS are discussed, and
other approaches to text categorization in the light of these
goals are examined. The TCS and how it meets its design goals are
described, and examples of applications built with TCS are given.
A text-categorization application developed with TCS consists of
the TCS run-time system and a rule base. The rule base defines
what categories the application can assign to texts and contains
rules that make the categorization decisions for particular
texts. The data-driven nature of TCS allows it to satisfy
fully the requirements of ease of application development,
portability to other applications and maintainability.},
}
@inproceedings{Hayes90,
  author    = {Philip J. Hayes and Steven P. Weinstein},
  title     = {{\sc Construe/Tis}: a system for content-based indexing of a
    database of news stories},
  booktitle = {Proceedings of IAAI-90, 2nd Conference on Innovative Applications
    of Artificial Intelligence},
  editor    = {Alain Rappaport and Reid Smith},
  publisher = {{AAAI} Press, Menlo Park, {US}},
  year      = {1990},
  pages     = {49--66},
  url       = {},
  abstract  = {},
}
@article{He03,
author = {Ji He and Ah-Hwee Tan and Chew-Lim Tan},
title = {On Machine Learning Methods for {C}hinese Document Categorization},
journal = {Applied Intelligence},
year = {2003},
volume = {18},
number = {3},
pages = {311--322},
url = {http://www.kluweronline.com/issn/0924-669X},
abstract = {This paper reports our comparative evaluation of three machine
learning methods, namely k Nearest Neighbor (kNN), Support Vector
Machines (SVM), and Adaptive Resonance Associative Map (ARAM) for
Chinese document categorization. Based on two Chinese corpora, a
series of controlled experiments evaluated their learning
capabilities and efficiency in mining text classification
knowledge. Benchmark experiments showed that their predictive
performance were roughly comparable, especially on clean and well
organized data sets. While kNN and ARAM yield better performances
than SVM on small and clean data sets, SVM and ARAM significantly
outperformed kNN on noisy data. Comparing efficiency, kNN was
notably more costly in terms of time and memory than the other
two methods. SVM is highly efficient in learning from well
organized samples of moderate size, although on relatively large
and noisy data the efficiency of SVM and ARAM are comparable.},
}
@article{Heaps73,
author = {H. S. Heaps},
title = {A theory of relevance for automatic document classification},
year = {1973},
journal = {Information and Control},
volume = {22},
number = {3},
pages = {268--278},
url = {},
abstract = {},
}
@inproceedings{Hearst91,
  author    = {Marti A. Hearst},
  title     = {Noun homograph disambiguation using local context in large
    corpora},
  booktitle = {Proceedings of the 7th Annual Conference of the University of
    Waterloo Centre for the New Oxford English Dictionary},
  editor    = {},
  publisher = {},
  address   = {Oxford, {UK}},
  year      = {1991},
  pages     = {1--22},
  url       = {ftp://parcftp.xerox.com/pub/hearst/oed91.ps.gz},
  abstract  = {This paper describes an accurate, relatively inexpensive method
    for the disambiguation of noun homographs using large text
    corpora. The algorithm checks the context surrounding the target
    noun against that of previously observed instances and chooses
    the sense for which the most evidence is found, where evidence
    consists of a set of orthographic, syntactic, and lexical
    features. Because the sense distinctions made are coarse, the
    disambiguation can be accomplished without the expense of
    knowledge bases or inference mechanisms. An implementation of the
    algorithm is described which, starting with a small set of
    hand-labeled instances, improves its results automatically via
    unsupervised training. The approach is compared to other attempts
    at homograph disambiguation using both machine readable
    dictionaries and unrestricted text and the use of training
    instances is determined to be a crucial difference.},
}
@proceedings{Hearst96a,
editor = {Marti A. Hearst and Haym Hirsh},
title = {Machine Learning in Information Access. Papers from the 1996 AAAI
Spring Symposium},
organization = {American Association for Artificial Intelligence},
address = {Stanford, {US}},
year = {1996},
note = {Available as Technical Report SS-96-05},
url = {},
abstract = {},
}
@inProceedings{Hersh94,
author = {William Hersh and Christopher Buckley and T. J. Leone and David
Hickman},
title = {{{\sc Ohsumed}}: an interactive retrieval evaluation and new
large text collection for research},
booktitle = {Proceedings of SIGIR-94, 17th ACM International Conference on
Research and Development in Information Retrieval},
editor = {W. Bruce Croft and Cornelis J. Van Rijsbergen},
publisher = {Springer Verlag, Heidelberg, {DE}},
address = {Dublin, {IE}},
pages = {192--201},
year = {1994},
url = {http://www.acm.org/pubs/articles/proceedings/ir/188490/p192-hersh/p192-hersh.pdf},
abstract = {A series of information retrieval experiments was carried out
with a computer installed in a medical practice setting for
relatively inexperienced physician end-users. Using a commercial
MEDLINE product based on the vector space model, these physicians
searched just as effectively as more experienced searchers using
Boolean searching. The results of this experiment were
subsequently used to create a new large medical test collection,
which was used in experiments with the SMART retrieval system to
obtain baseline performance data as well as compare SMART with
the other searchers.},
}
@inproceedings{Hoashi00,
  author    = {Keiichiro Hoashi and Kazunori Matsumoto and Naomi Inoue and Kazuo
    Hashimoto},
  title     = {Document filtering methods using non-relevant information profile},
  booktitle = {Proceedings of SIGIR-00, 23rd ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Athens, {GR}},
  year      = {2000},
  pages     = {176--183},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/345508/p176-hoashi/p176-hoashi.pdf},
  abstract  = {Document filtering is a task to retrieve documents relevant to a
    user's profile from a flow of documents. Generally, filtering
    systems calculate the similarity between the profile and each
    incoming document, and retrieve documents with similarity higher
    than a threshold. However, many systems set a relatively high
    threshold to reduce retrieval of non-relevant documents, which
    results in the ignorance of many relevant documents. In this
    paper, we propose the use of a non-relevant information profile
    to reduce the mistaken retrieval of non-relevant documents.
    Results from experiments show that this filter has successfully
    rejected a sufficient number of non-relevant documents, resulting
    in an improvement of filtering performance.},
}
@inproceedings{Hoch94,
  author    = {Rainer Hoch},
  title     = {Using {IR} techniques for text classification in document
    analysis},
  booktitle = {Proceedings of SIGIR-94, 17th ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {W. Bruce Croft and Cornelis J. Van Rijsbergen},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Dublin, {IE}},
  year      = {1994},
  pages     = {31--40},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/188490/p31-hoch/p31-hoch.pdf},
  abstract  = {This paper presents the INFOCLAS system applying statistical
    methods of information retrieval for the classification of German
    business letters into corresponding message types such as order,
    offer, enclosure, etc. INFOCLAS is a first step towards the
    understanding of documents proceeding to a classification-driven
    extraction of information. The system is composed of two main
    modules: the central indexer (extraction and weighting of
    indexing terms) and the classifier (classification of business
    letters into given types). The system employs several knowledge
    sources including a letter database, word frequency statistics
    for German, lists of message type specific words, morphological
    knowledge as well as the underlying document structure. As
    output, the system evaluates a set of weighted hypotheses about
    the type of the actual letter. Classification of documents allow
    the automatic distribution or archiving of letters and is also an
    excellent starting point for higher-level document analysis.},
}
@article{Hoyle73,
author = {W. G. Hoyle},
title = {Automatic indexing and generation of classification by algorithm},
journal = {Information Storage and Retrieval},
year = {1973},
volume = {9},
number = {4},
pages = {233--242},
url = {},
abstract = {A system of automatic indexing based on Bayes' theorem is
described briefly. In assigning 124 documents to 9 categories,
there were 97 cases of agreement with professional indexers.
Using a correction factor, based on 87 per cent human consistency
from other sources, the computer appears then to index with 90
per cent accuracy in this case. The technique is then used with
two randomized sample document groups drawn from nine categories.
Each group in turn is used as the basis for indexing the other.
The computer knows only the number of categories. After 8 cycles
the computer is found to have formed 9 groups consisting of about
50 per cent of documents that were also lumped together by
professional indexers on the basis of subject content. A new
measure of performance is proposed and some other applications of
the technique indicated.},
}
@inProceedings{Hsu99,
author = {Wen-Lin Hsu and Sheau-Dong Lang},
title = {Classification algorithms for {NETNEWS} articles},
booktitle = {Proceedings of CIKM-99, 8th ACM International Conference on
Information and Knowledge Management},
publisher = {{ACM} Press, New York, {US}},
editor = {},
year = {1999},
address = {Kansas City, {US}},
pages = {114--121},
url = {http://www.acm.org/pubs/articles/proceedings/cikm/319950/p114-hsu/p114-hsu.pdf},
abstract = {We propose several algorithms using the vector space model to
classify the news articles posted on the NETNEWS according to the
newsgroup categories. The baseline method combines the terms of
all the articles of each newsgroup in the training set to
represent the newsgroups as single vectors. After training, the
incoming news articles are classified based on their similarity
to the existing newsgroup categories. We propose to use the
following techniques to improve the classification performance of
the baseline method: (1) use routing (classification) accuracy
and the similarity values to refine the training set; (2) update
the underlying term structures periodically during testing; and
(3) apply k-means clustering to partition the newsgroup articles
and represent each newsgroup by k vectors. Our test collection
consists of the real news articles and the 519 subnewsgroups
under the REC newsgroup of NETNEWS in a period of 3 months. Our
experimental results demonstrate that the technique of refining
the training set reduces from one-third to two-thirds of the
storage. The technique of periodical updates improves the routing
accuracy ranging from 20\% to 100\% but incurs runtime overhead.
Finally, representing each newsgroup by k vectors (with k = 2 or
3) using clustering yields the most significant improvement in
routing accuracy, ranging from 60\% to 100\%, while causing only
slightly higher storage requirements.},
}
@inproceedings{Hsu99a,
  author    = {Wen-Lin Hsu and Sheau-Dong Lang},
  title     = {Feature Reduction and Database Maintenance in {NETNEWS}
    Classification},
  booktitle = {Proceedings of IDEAS-99, 1999 International Database Engineering
    and Applications Symposium},
  editor    = {},
  publisher = {{IEEE} Computer Society Press, Los Alamitos, {US}},
  address   = {Montreal, {CA}},
  year      = {1999},
  pages     = {137--144},
  url       = {http://dlib.computer.org/conferen/ideas/0265/pdf/02650137.pdf},
  abstract  = {We propose a statistical feature-reduction technique to filter
    out the most ambiguous articles in the training data for
    categorizing the NETNEWS articles. We also incorporate a batch
    updating scheme to periodically do maintenance on the term
    structures of the news database after training. The baseline
    method combines the terms of all the articles of each newsgroup
    in the training set to represent the newsgroups as single
    vectors. After training, the incoming news articles are
    classified based on their similarity to the existing newsgroup
    categories. Our implementation uses an inverted file to store the
    trained term structures of each newsgroup, and uses a list
    similar to the inverted file to buffer the newly arrival
    articles, for efficient routing and updating purposes. Our
    experimental results using real NETNEWS articles and newsgroups
    demonstrate (1) applying feature reduction to the training set
    improves the routing accuracy, efficiency, and database storage;
    (2) updating improves the routing accuracy; and (3) the batch
    technique improves the efficiency of the updating operation.},
}
@inproceedings{Huffman94,
  author    = {Stephen Huffman and Marc Damashek},
  title     = {Acquaintance: A Novel Vector-Space N-Gram Technique for Document
    Categorization},
  booktitle = {Proceedings of TREC-3, 3rd Text Retrieval Conference},
  editor    = {Donna K. Harman},
  publisher = {National Institute of Standards and Technology, Gaithersburg, {US}},
  address   = {Gaithersburg, {US}},
  year      = {1994},
  pages     = {305--310},
  url       = {},
  abstract  = {Acquaintance is the name of a novel vector-space n-gram technique
    for categorizing documents. The technique is completely
    language-independent, highly garble-resistant, and
    computationally simple. An unoptimized version of the algorithm
    was used to process the TREC database in a very short time.},
}
@inProceedings{Huffman95,
author = {Stephen Huffman},
title = {Acquaintance: Language-Independent Document Categorization by
N-Grams},
booktitle = {Proceedings of TREC-4, 4th Text Retrieval Conference},
publisher = {National Institute of Standards and Technology, Gaithersburg, {US}},
editor = {Donna K. Harman and Ellen M. Voorhees},
year = {1995},
address = {Gaithersburg, {US}},
pages = {359--371},
url = {http://trec.nist.gov/pubs/trec4/papers/nsa.ps.gz},
abstract = {Acquaintance is the name of a novel vector-space n-gram technique
for categorizing documents. The technique is completely
language-independent, highly garble-resistant, and
computationally simple. An unoptimized version of the algorithm
was used to process the TREC database in a very short time. The
TREC-3 conference provided the first public demonstration and
evaluation of this new technique, and TREC-4 provided an
opportunity to test its usefulness on several types of text
retrieval tasks.},
}
@inProceedings{Hull94,
author = {Hull, David A.},
title = {Improving text retrieval for the routing problem using latent
semantic indexing},
booktitle = {Proceedings of SIGIR-94, 17th ACM International Conference on
Research and Development in Information Retrieval},
editor = {W. Bruce Croft and Cornelis J. Van Rijsbergen},
publisher = {Springer Verlag, Heidelberg, {DE}},
year = {1994},
address = {Dublin, {IE}},
pages = {282--289},
url = {http://www.acm.org/pubs/articles/proceedings/ir/188490/p282-hull/p282-hull.pdf},
abstract = {Latent Semantic Indexing (LSI) is a novel approach to information
retrieval that attempts to model the underlying structure of term
associations by transforming the traditional representation of
documents as vectors of weighted term frequencies to a new
coordinate space where both documents and terms are represented
as linear combinations of underlying semantic factors. In
previous research, LSI has produced a small improvement in
retrieval performance. In this paper, we apply LSI to the routing
task, which operates under the assumption that a sample of
relevant and non-relevant documents is available to use in
constructing the query. Once again, LSI slightly improves
performance. However, when LSI is used in conjunction with
statistical classification, there is a dramatic improvement in
performance.},
}
@inproceedings{Hull96,
  author    = {David A. Hull and Jan O. Pedersen and Hinrich Sch{\"u}tze},
  title     = {Method combination for document filtering},
  booktitle = {Proceedings of SIGIR-96, 19th ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {Hans-Peter Frei and Donna Harman and Peter Sch{\"{a}}uble and
    Ross Wilkinson},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Z{\"{u}}rich, {CH}},
  year      = {1996},
  pages     = {279--288},
  url       = {ftp://parcftp.xerox.com/pub/qca/papers/sigirfiltering96.ps},
  abstract  = {There is strong empirical and theoretic evidence that combination
    of retrieval methods can improve performance. In this paper, we
    systematically compare combination strategies in the context of
    document filtering, using queries from the Tipster reference
    corpus. We find that simple averaging strategies do indeed
    improve performance, but that direct averaging of probability
    estimates is not the correct approach. Instead, the probability
    estimates must be renormalized using logistic regression on the
    known relevance judgements. We examine more complex combination
    strategies but find them less successful due to the high
    correlations among our filtering methods which are optimized over
    the same training data and employ similar document
    representations.},
}
@inProceedings{Hull98,
author = {David A. Hull},
title = {The {TREC-7} filtering track: description and analysis},
booktitle = {Proceedings of TREC-7, 7th Text Retrieval Conference},
publisher = {National Institute of Standards and Technology, Gaithersburg, {US}},
editor = {Ellen M. Voorhees and Donna K. Harman},
year = {1998},
address = {Gaithersburg, {US}},
pages = {33--56},
url = {http://trec.nist.gov/pubs/trec7/papers/tr7filter/paper.ps},
abstract = {This article describes the experiments conducted in the TREC-7
filtering track, which consisted of three subtasks: adaptive
filtering, batch filtering, and routing. The focus this year is
on adaptive filtering, where the system begins with only the
topic statement and must interactively adjust a filtering profile
constructed from that topic in response to on-line feedback. In
addition to motivating the task and describing the practical
details of participating in the track, this document includes a
detailed graphical presentation of the experimental results and
provides a brief overall analysis of the performance data.},
}
@inProceedings{Ipeirotis01,
author = {Panagiotis G. Ipeirotis and Luis Gravano and Mehran Sahami},
title = {Probe, count, and classify: categorizing hidden {W}eb databases},
booktitle = {Proceedings of SIGMOD-01, ACM International Conference on
Management of Data},
editor = {Walid G. Aref},
publisher = {{ACM} Press, New York, {US}},
year = {2001},
address = {Santa Barbara, {US}},
pages = {67--78},
url = {http://doi.acm.org/10.1145/375663.375671},
abstract = {The contents of many valuable web-accessible databases are only
accessible through search interfaces and are hence invisible to
traditional web ``crawlers''. Recent studies have estimated the
size of this ``hidden web'' to be 500 billion pages, while the
size of the ``crawlable'' web is only an estimated two billion
pages. Recently, commercial web sites have started to manually
organize web-accessible databases into Yahoo!-like hierarchical
classification schemes. In this paper, we introduce a method for
automating this classification process by using a small number
of query probes. To classify a database, our algorithm does not
retrieve or inspect any documents or pages from the database,
but rather just exploits the number of matches that each query
probe generates at the database in question. We have conducted an
extensive experimental evaluation of our technique over
collections of real documents, including over one hundred
web-accessible databases. Our experiments show that our system
has low overhead and achieves high classification accuracy
across a variety of databases.},
}
@inproceedings{Ittner95,
  author    = {David J. Ittner and Lewis, David D. and David D. Ahn},
  title     = {Text categorization of low quality images},
  booktitle = {Proceedings of SDAIR-95, 4th Annual Symposium on Document
    Analysis and Information Retrieval},
  editor    = {},
  publisher = {},
  address   = {Las Vegas, {US}},
  year      = {1995},
  pages     = {301--315},
  url       = {http://www.research.att.com/~lewis/papers/ittner95.ps},
  abstract  = {Categorization of text images into content-oriented classes would
    be a useful capability in a variety of document handling systems.
    Many methods can be used to categorize texts once their words are
    known, but OCR can garble a large proportion of words,
    particularly when low quality images are used. Despite this, we
    show for one data set that fax quality images can be categorized
    with nearly the same accuracy as the original text. Further, the
    categorization system can be trained on noisy OCR output, without
    need for the true text of any image, or for editing of OCR
    output. The use of a vector space classifier and training method
    robust to large feature sets, combined with discarding of low
    frequency OCR output strings are the key to our approach.},
}
@inproceedings{Iwayama94,
  author    = {Makoto Iwayama and Takenobu Tokunaga},
  title     = {A Probabilistic Model for Text Categorization: Based on a Single
    Random Variable with Multiple Values},
  booktitle = {Proceedings of ANLP-94, 4th Conference on Applied Natural
    Language Processing},
  editor    = {},
  publisher = {Association for Computational Linguistics, Morristown, {US}},
  address   = {Stuttgart, {DE}},
  year      = {1994},
  pages     = {162--167},
  url       = {},
  abstract  = {},
}
@inproceedings{Iwayama95,
  author    = {Makoto Iwayama and Takenobu Tokunaga},
  title     = {Cluster-based text categorization: a comparison of category
    search strategies},
  booktitle = {Proceedings of SIGIR-95, 18th ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {Edward A. Fox and Peter Ingwersen and Raya Fidel},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Seattle, {US}},
  year      = {1995},
  pages     = {273--281},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/215206/p273-iwayama/p273-iwayama.pdf},
  abstract  = {Text categorization can be viewed as a process of category
    search, in which one or more categories for a test document are
    searched for by using given training documents with known
    categories. A cluster based search with a probabilistic
    clustering algorithm is proposed and evaluated on two data sets.
    The efficiency, effectiveness, and noise tolerance of this search
    strategy were confirmed to be better than those of a full search,
    a category based search, and a cluster based search with
    nonprobabilistic clustering.},
}
@inproceedings{Iwayama95a,
  author    = {Makoto Iwayama and Takenobu Tokunaga},
  title     = {Hierarchical {B}ayesian clustering for automatic text
    classification},
  booktitle = {Proceedings of IJCAI-95, 14th International Joint Conference on
    Artificial Intelligence},
  editor    = {Chris E. Mellish},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Montreal, {CA}},
  year      = {1995},
  pages     = {1322--1327},
  url       = {},
  abstract  = {Text classification, the grouping of texts into several clusters,
    has been used as a means of improving both the efficiency and the
    effectiveness of text retrieval/categorization. In this paper we
    propose a hierarchical clustering algorithm that constructs a set
    of clusters having the maximum Bayesian posterior probability,
    the probability that the given texts are classified into
    clusters. We call the algorithm Hierarchical Bayesian Clustering
    (HBC). The advantages of HBC are experimentally verified from
    several viewpoints. HBC can reconstruct the original clusters
    more accurately than other non-probabilistic algorithms. When a
    probabilistic text categorization is extended to a cluster-based
    one, the use of HBC offers better performance than the use of
    non-probabilistic algorithms.},
}
@inproceedings{Iwazume96,
  author    = {Michiaki Iwazume and Hideaki Takeda and Toyoaki Nishida},
  title     = {Ontology-Based Information Gathering and Text Categorization from
    the {I}nternet},
  booktitle = {Proceedings of IEA/AIE-96, 9th International Conference in
    Industrial and Engineering Applications of Artificial
    Intelligence and Expert Systems},
  editor    = {},
  publisher = {},
  address   = {Fukuoka, {JP}},
  year      = {1996},
  pages     = {305--314},
  url       = {},
  abstract  = {},
}
@inProceedings{Iyer00,
author = {Raj D. Iyer and David D. Lewis and Robert E. Schapire and Yoram
Singer and Amit Singhal},
title = {Boosting for Document Routing},
booktitle = {Proceedings of CIKM-00, 9th ACM International Conference on
Information and Knowledge Management},
publisher = {{ACM} Press, New York, {US}},
address = {McLean, {US}},
editor = {Arvin Agah and Jamie Callan and Elke Rundensteiner},
year = {2000},
pages = {70--77},
url = {http://www.cs.huji.ac.il/~singer/papers/rankboost.ps.gz},
abstract = {RankBoost is a recently proposed algorithm for learning ranking
functions. It is simple to implement and has strong
justifications from computational learning theory. We describe
the algorithm and present experimental results on applying it to
the document routing problem. The first set of results applies
RankBoost to a text representation produced using modern term
weighting methods. Performance of RankBoost is somewhat inferior
to that of a state-of-the-art routing algorithm which is,
however, more complex and less theoretically justified than
RankBoost. RankBoost achieves comparable performance to the
state-of-the-art algorithm when combined with feature or example
selection heuristics. Our second set of results examines the
behavior of RankBoost when it has to learn not only a ranking
function but also all aspects of term weighting from raw data.
Performance is usually, though not always, less good here, but
the term weighting functions implicit in the resulting ranking
functions are intriguing, and the approach could easily be
adapted to mixtures of textual and nontextual data.},
}
@inproceedings{Jacobs92,
  author    = {Paul S. Jacobs},
  title     = {Joining statistics with {NLP} for text categorization},
  booktitle = {Proceedings of ANLP-92, 3rd Conference on Applied Natural
    Language Processing},
  editor    = {Marcia Bates and Oliviero Stock},
  publisher = {Association for Computational Linguistics, Morristown, {US}},
  address   = {Trento, {IT}},
  year      = {1992},
  pages     = {178--185},
  url       = {},
  abstract  = {Automatic news categorization systems have produced high
    accuracy, consistency, and flexibility using some natural
    language processing techniques. These knowledge-based
    categorization methods are more powerful and accurate than
    statistical techniques. However, the phrasal pre-processing and
    pattern matching methods that seem to work for categorization
    have the disadvantage of requiring a fair amount of
    knowledge-encoding by human beings. In addition, they work much
    better at certain tasks, such as identifying major events in
    texts, than at others, such as determining what sort of business
    or product is involved in a news event. Statistical methods for
    categorization, on the other hand, are easy to implement and
    require little or no human customization. But they don't offer
    any of the benefits of natural language processing, such as the
    ability to identify relationships and enforce linguistic
    constraints. The authors' approach has been to use statistics in
    the knowledge acquisition component of a linguistic pattern-based
    categorization system, using statistical methods, for example, to
    associate words with industries and identify phrases that
    information about businesses or products. Instead of replacing
    knowledge-based methods with statistics, statistical training
    replaces knowledge engineering. This has resulted in high
    accuracy, shorter customization time, and good prospects for the
    application of the statistical methods to problems in lexical
    acquisition.},
}
@article{Jacobs93,
  author   = {Paul S. Jacobs},
  title    = {Using Statistical Methods to Improve Knowledge-Based News
    Categorization},
  journal  = {{IEEE} Expert},
  year     = {1993},
  volume   = {8},
  number   = {2},
  pages    = {13--23},
  url      = {},
  abstract = {},
}
@inproceedings{Jo99,
  author    = {Taeho C. Jo},
  title     = {Text categorization with the concept of fuzzy set of informative
    keywords},
  booktitle = {Proceedings of FUZZ-IEEE'99, IEEE International Conference on
    Fuzzy Systems},
  editor    = {},
  publisher = {{IEEE} Computer Society Press, Los Alamitos, {US}},
  address   = {Seoul, {KR}},
  year      = {1999},
  pages     = {609--614},
  url       = {},
  abstract  = {Text categorization is the procedure of assigning a category to a
    particular document among predefined categories. Informative
    keywords are the ones which reflect the contents of a document. A
    document includes informative keywords and non-informative
    keywords. Mainly non-informative keywords play the roles of
    grammatical functions in sentences; such keywords, what are
    called functional keywords, reflect its contents very little, so
    they should be removed in the process of document indexing. The
    discrimination between informative keywords and functional
    keywords is not crisp. In the process of document indexing, a
    document is represented as a set of informative keywords. In this
    paper, it is proposed that a document be represented into a fuzzy
    set of informative keywords, instead of a crisp set of
    informative keywords. The experiments of the categorization of
    news articles show that the proposed schemes of text
    categorization outperform the schemes with crisp sets.},
}
@incollection{Jo99a,
  author    = {Taeho C. Jo},
  title     = {News article classification based on categorical points from
    keywords in backdata},
  booktitle = {Computational Intelligence for Modelling, Control and Automation},
  editor    = {Masoud Mohammadian},
  publisher = {{IOS} Press},
  address   = {Amsterdam, {NL}},
  year      = {1999},
  pages     = {211--214},
  url       = {},
  abstract  = {A scheme of automatic document classification is presented.
    Previously, documents have been classified according to their
    contents manually. Therefore, it is very costly to assign a
    category to them because a human investigates their contents. As
    the amount of data stored in storage media is increased
    exponentially, it becomes necessary to store documents according
    to their category, to access them easily. Automatic text
    classification is needed to store documents like that. Before
    performing text classification, back data should be constructed.
    The back data stores the information about keywords: the
    frequency for each category, the number of documents for each
    category. A document is represented with a list of keywords.
    Categorical points to each category are computed by summing the
    frequency of each keyword from back data, or the number of
    documents from it. The category that contains the largest
    categorical points is selected as the category of a document. In
    the results of an experiment with news article classification,
    precision is about 98\%.},
}
@incollection{Jo99b,
  author    = {Taeho C. Jo},
  title     = {News articles classification based on representative keywords of
               categories},
  booktitle = {Computational Intelligence for Modelling, Control and Automation},
  editor    = {Masoud Mohammadian},
  publisher = {{IOS} Press},
  address   = {Amsterdam, {NL}},
  year      = {1999},
  pages     = {194--198},
  url       = {},
  abstract  = {A scheme of automatic document classification is presented. So
               far, documents have been classified according to their contents
               manually. Therefore, it is very costly to assign a category for
               them because humans investigate their contents. As the amount of
               data stored in storage media is increased exponentially, it
               becomes necessary to store documents according to their category,
               to access them easily. Automatic text classification is necessary
               to store documents like that. The scheme for automatic text
               classification proposed in the paper, is based on document
               indexing, where a document is represented as a list of keywords.
               The number of common keywords between keywords from the document
               itself and representative keywords from back data classifies
               documents. As an example, the proposed scheme is applied to the
               classification of news articles into 3 categories: politics,
               sports, and business. The measurements of performance evaluation
               are: classification rate, correctness rate, and classified
               correctness rate.},
}
@inproceedings{Joachims97,
  author    = {Thorsten Joachims},
  title     = {A probabilistic analysis of the {R}occhio algorithm with {TFIDF}
               for text categorization},
  booktitle = {Proceedings of ICML-97, 14th International Conference on Machine
               Learning},
  editor    = {Douglas H. Fisher},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Nashville, {US}},
  year      = {1997},
  pages     = {143--151},
  url       = {http://www-ai.cs.uni-dortmund.de/DOKUMENTE/joachims_97a.ps.gz},
  abstract  = {The Rocchio relevance feedback algorithm is one of the most
               popular and widely applied learning methods from information
               retrieval. Here, a probabilistic analysis of this algorithm is
               presented in a text categorization framework. The analysis gives
               theoretical insight into the heuristics used in the Rocchio
               algorithm, particularly the word weighting scheme and the
               similarity metric. It also suggests improvements which lead to a
               probabilistic variant of the Rocchio classifier. The Rocchio
               classifier, its probabilistic variant, and a naive Bayes
               classifier are compared on six text categorization tasks. The
               results show that the probabilistic algorithms are preferable to
               the heuristic Rocchio classifier not only because they are more
               well-founded, but also because they achieve better performance.},
}
@inproceedings{Joachims97b,
  author    = {Thorsten Joachims and Dayne Freitag and Tom M. Mitchell},
  title     = {{\sc WebWatcher}: a tour guide for the {W}orld {W}ide {W}eb},
  booktitle = {Proceedings of IJCAI-97, 15th International Joint Conference on
               Artificial Intelligence},
  editor    = {Martha E. Pollack},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Nagoya, {JP}},
  year      = {1997},
  pages     = {770--775},
  url       = {http://www.cs.cmu.edu/afs/cs/user/dayne/www/ps/ijcai97.ps.Z},
  abstract  = {We describe WebWatcher as a tour guide agent for the web, the
               learning algorithms used by WebWatcher, experimental results
               based on learning from thousands of users, and lessons learned
               from this case study of tour guide agents.},
}
@inproceedings{Joachims98,
  author    = {Thorsten Joachims},
  title     = {Text categorization with support vector machines: learning with
               many relevant features},
  booktitle = {Proceedings of ECML-98, 10th European Conference on Machine
               Learning},
  editor    = {Claire N{\'{e}}dellec and C{\'{e}}line Rouveirol},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Chemnitz, {DE}},
  year      = {1998},
  pages     = {137--142},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 1398},
  url       = {http://www-ai.cs.uni-dortmund.de/DOKUMENTE/joachims_98a.ps.gz},
  abstract  = {The paper explores the use of Support Vector Machines (SVMs) for
               learning text classifiers from examples. It analyzes the
               particular properties of learning with text data and identifies
               why SVMs are appropriate for this task. Empirical results support
               the theoretical findings. SVMs achieve substantial improvements
               over the currently best performing methods and behave robustly
               over a variety of different learning tasks. Furthermore, they are
               fully automatic, eliminating the need for manual parameter
               tuning.},
}
@inproceedings{Joachims99,
  author    = {Thorsten Joachims},
  title     = {Transductive Inference for Text Classification using Support
               Vector Machines},
  booktitle = {Proceedings of ICML-99, 16th International Conference on Machine
               Learning},
  editor    = {Ivan Bratko and Sa{\v{s}}o D{\v{z}}eroski},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Bled, {SL}},
  year      = {1999},
  pages     = {200--209},
  url       = {http://www-ai.cs.uni-dortmund.de/DOKUMENTE/joachims_99c.ps.gz},
  abstract  = {This paper introduces transductive support vector machines
               (TSVMs) for text classification. While regular support vector
               machines (SVMs) try to induce a general decision function for a
               learning task, TSVMs take into account a particular test set and
               try to minimize misclassifications of just those particular
               examples. The paper presents an analysis of why TSVMs are well
               suited for text classification. These theoretical findings are
               supported by experiments on three test collections. The
               experiments show substantial improvements over inductive methods,
               especially for small training sets, cutting the number of labeled
               training examples down to a 20th on some tasks. This work also
               proposes an algorithm for training TSVMs efficiently, handling
               10,000 examples and more.},
}
@inproceedings{Joachims00,
  author    = {Thorsten Joachims},
  title     = {Estimating the Generalization Performance of a {SVM} Efficiently},
  booktitle = {Proceedings of ICML-00, 17th International Conference on Machine
               Learning},
  editor    = {Pat Langley},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Stanford, {US}},
  year      = {2000},
  pages     = {431--438},
  url       = {http://www-ai.cs.uni-dortmund.de/DOKUMENTE/joachims_00a.pdf},
  abstract  = {This paper proposes and analyzes an efficient and effective
               approach for estimating the generalization performance of a
               support vector machine (SVM) for text classification. Without any
               computation-intensive resampling, the new estimators are
               computationally much more efficient than cross-validation or
               bootstrapping. They can be computed at essentially no extra cost
               immediately after training a single SVM. Moreover, the estimators
               developed here address the special performance measures needed
               for evaluating text classifiers. They can be used not only to
               estimate the error rate, but also to estimate recall, precision,
               and F1. A theoretical analysis and experiments show that the new
               method can effectively estimate the performance of SVM text
               classifiers in an efficient way.},
}
@inproceedings{Joachims01c,
  author    = {Thorsten Joachims},
  title     = {A Statistical Learning Model of Text Classification with Support
               Vector Machines},
  booktitle = {Proceedings of SIGIR-01, 24th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {W. Bruce Croft and David J. Harper and Donald H. Kraft and Justin
               Zobel},
  publisher = {{ACM} Press, New York, {US}},
  address   = {New Orleans, {US}},
  year      = {2001},
  pages     = {128--136},
  url       = {http://www.cs.cornell.edu/People/tj/publications/joachims_01a.pdf},
  abstract  = {This paper develops a theoretical learning model of text
               classification for Support Vector Machines (SVMs). It connects
               the statistical properties of text-classification tasks with the
               generalization performance of a SVM in a quantitative way. Unlike
               conventional approaches to learning text classifiers, which rely
               primarily on empirical evidence, this model explains why and when
               SVMs perform well for text classification. In particular, it
               addresses the following questions: Why can support vector
               machines handle the large feature spaces in text classification
               effectively? How is this related to the statistical properties of
               text? What are sufficient conditions for applying SVMs to
               text-classification problems successfully?},
}
@inproceedings{Joachims01b,
  author    = {Thorsten Joachims and Nello Cristianini and John Shawe-Taylor},
  title     = {Composite Kernels for Hypertext Categorisation},
  booktitle = {Proceedings of ICML-01, 18th International Conference on Machine
               Learning},
  editor    = {Carla Brodley and Andrea Danyluk},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Williams College, {US}},
  year      = {2001},
  pages     = {250--257},
  url       = {http://www.cs.cornell.edu/People/tj/publications/joachims_etal_01a.pdf},
  abstract  = {Kernels are problem-specific functions that act as an interface
               between the learning system and the data. While it is well-known
               when the combination of two kernels is again a valid kernel, it
               is an open question if the resulting kernel will perform well. In
               particular, in which situations can a combination of kernels be
               expected to perform better than its components considered
               separately? Intuitively, one would like each of the two kernels
               to contribute information that is not available to the other.
               This characterization hence must consider the data at hand, both
               the kernels and also the task, that is the information given by
               the labels. We investigate this problem by looking at the task of
               designing kernels for hypertext classification, where both words
               and links information can be exploited. Firstly we introduce a
               novel kernel, whose Gram matrix is the well known co-citation
               matrix from bibliometrics, and demonstrate on real data that it
               has a good performance. Then we study the problem of combining it
               with a standard bag of words kernel. We provide sufficient
               conditions that indicate when an improvement can be expected,
               highlighting and formalising the notion of ``independent
               kernels''. Experimental results confirm the predictions of the
               theory in the hypertext domain.},
}
@book{Joachims02a,
  author    = {Thorsten Joachims},
  title     = {Learning to Classify Text using Support Vector Machines},
  publisher = {Kluwer Academic Publishers},
  address   = {Dordrecht, {NL}},
  year      = {2002},
}
@article{Joachims02,
  author  = {Thorsten Joachims and Fabrizio Sebastiani},
  title   = {Guest editors' introduction to the special issue on automated
             text categorization},
  journal = {Journal of Intelligent Information Systems},
  year    = {2002},
  volume  = {18},
  number  = {2/3},
  pages   = {103--105},
  note    = {Special Issue on Automated Text Categorization},
  url     = {http://www.wkap.nl/article.pdf?391241},
}
@article{Juan02,
  author   = {Juan, Alfons and Vidal, Enrique},
  title    = {On the use of {B}ernoulli mixture models for text classification},
  journal  = {Pattern Recognition},
  year     = {2002},
  volume   = {35},
  number   = {12},
  pages    = {2705--2710},
  url      = {},
  abstract = {Mixture modelling of class-conditional densities is a standard
              pattern recognition technique. Although most research on mixture
              models has concentrated on mixtures for continuous data, emerging
              pattern recognition applications demand extending research
              efforts to other data types. This paper focuses on the
              application of mixtures of multivariate Bernoulli distributions
              to binary data. More concretely, a text classification task aimed
              at improving language modelling for machine translation is
              considered.},
}
@inproceedings{Junker97,
  author    = {Markus Junker and Andreas Abecker},
  title     = {Exploiting Thesaurus Knowledge in Rule Induction for Text
               Classification},
  booktitle = {Proceedings of RANLP-97, 2nd International Conference on Recent
               Advances in Natural Language Processing},
  editor    = {Ruslan Mitkov and Nicolas Nicolov and Nikolai Nikolov},
  publisher = {},
  address   = {Tzigov Chark, {BL}},
  year      = {1997},
  pages     = {202--207},
  url       = {http://www.dfki.uni-kl.de/~junker/download/ranlp97.ps},
  abstract  = {Systems for learning text classifiers recently gained
               considerable interest. One technique to implement such systems is
               rule induction. While most other approaches rely on a relatively
               simple document representation and do not make use of any
               background knowledge, rule induction algorithms offer a good
               potential for improvements in both of these areas. In this paper,
               we show how an operator-based view of rule induction enables the
               easy integration of a thesaurus as background knowledge. Results
               with an algorithm extended by thesaurus knowledge are presented
               and interpreted. The interpretation shows the strengths and
               weaknesses of using thesaurus knowledge and gives hints for
               future research.},
}
@article{Junker98,
  author   = {Markus Junker and Rainer Hoch},
  title    = {An experimental evaluation of {OCR} text representations for
              learning document classifiers},
  journal  = {International Journal on Document Analysis and Recognition},
  year     = {1998},
  volume   = {1},
  number   = {2},
  pages    = {116--122},
  url      = {http://link.springer.de/link/service/journals/10032/papers/8001002/80010116.ps.gz},
  abstract = {In the literature, many feature types are proposed for document
              classification. However, an extensive and systematic evaluation
              of the various approaches has not yet been done. In particular,
              evaluations on OCR documents are very rare. In this paper we
              investigate seven text representations based on n-grams and
              single words. We compare their effectiveness in classifying OCR
              texts and the corresponding correct ASCII texts in two domains:
              business letters and abstracts of technical reports. Our results
              indicate that the use of n-grams is an attractive technique which
              can even compare to techniques relying on a morphological
              analysis. This holds for OCR texts as well as for correct ASCII
              texts.},
}
@inproceedings{Junker00,
  author    = {Markus Junker and Michael Sintek and Matthias Rinck},
  title     = {Learning for text categorization and information extraction with
               {ILP}},
  booktitle = {Proceedings of the 1st Workshop on Learning Language in Logic},
  editor    = {James Cussens and Sa{\v{s}}o D{\v{z}}eroski},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Bled, {SL}},
  year      = {2000},
  pages     = {247--258},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 1925},
  url       = {},
  abstract  = {Text categorization (TC) and information extraction (IE) are two
               important goals of natural language processing. While
               hand-crafting rules for both tasks has a long tradition, learning
               approaches used to gain much interest in the past. Since in both
               tasks text as a sequence of words is of crucial importance,
               propositional learners have strong limitations. Although viewing
               learning for TC and IE as inductive logic programming (ILP)
               problems is obvious, most approaches rather use proprietary
               formalisms. In this paper, we provide a solid basis for the
               application of ILP methods to these learning problems. We
               introduce three basic types (namely a type for text, one for
               words and one for positions in texts) and three simple predicate
               definitions over these types which enable us to write TC and IE
               rules as logic programs. Based on the proposed representation, we
               present an approach to the problem of learning rules for TC and
               IE in terms of ILP. We conclude by comparing our approach of
               representing texts and rules as logic programs to others.},
}
@inproceedings{Junker01,
  author    = {Markus Junker and Andreas Dengel},
  title     = {Preventing Overfitting in Learning Text Patterns for Document
               Categorization},
  booktitle = {Proceedings of ICAPR-01, 2nd International Conference on Advances
               in Pattern Recognition},
  editor    = {Sameer Singh and Nabeel A. Murshed and Walter Kropatsch},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Rio de Janeiro, {BR}},
  year      = {2001},
  pages     = {137--146},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 2013},
  url       = {http://link.springer.de/link/service/series/0558/papers/2013/20130137.pdf},
  abstract  = {There is an increasing interest in categorizing texts using
               learning algorithms. While the majority of approaches rely on
               learning linear classifiers, there is also some interest in
               describing document categories by text patterns. We introduce a
               model for learning patterns for text categorization (the
               LPT-model) that does not rely on an attribute-value
               representation of documents but represents documents essentially
               ``as they are''. Based on the LPT-model, we focus on learning
               patterns within a relatively simple pattern language. We compare
               different search heuristics and pruning methods known from
               various symbolic rule learners on a set of representative text
               categorization problems. The best results were obtained using the
               m-estimate as search heuristics combined with the
               likelihood-ratio-statics for pruning. Even better results can be
               obtained, when replacing the likelihood-ratio-statics by a new
               measure for pruning; this we call l-measure. In contrast to
               conventional measures for pruning, the l-measure takes into
               account properties of the search space.},
}
@article{Kaban02,
  author   = {Ata Kab{\'{a}}n and Mark Girolami},
  title    = {A Dynamic Probabilistic Model to Visualise Topic Evolution in
              Text Streams},
  journal  = {Journal of Intelligent Information Systems},
  year     = {2002},
  volume   = {18},
  number   = {2/3},
  pages    = {107--125},
  note     = {Special Issue on Automated Text Categorization},
  url      = {http://www.wkap.nl/article.pdf?391242},
  abstract = {We propose a novel probabilistic method, based on latent variable
              models, for unsupervised topographic visualisation of dynamically
              evolving, coherent textual information. This can be seen as a
              complementary tool for topic detection and tracking applications.
              This is achieved by the exploitation of the a priori domain
              knowledge available, that there are relatively homogeneous
              temporal segments in the data stream. In a different manner from
              topographical techniques previously utilized for static text
              collections, the topography is an outcome of the coherence in
              time of the data stream in the proposed model. Simulation results
              on both toy-data settings and an actual application on Internet
              chat line discussion analysis is presented by way of
              demonstration.},
}
@article{Kar78,
  author   = {Gautam Kar and Lee J. White},
  title    = {A distance measure for automated document classification by
              sequential analysis},
  journal  = {Information Processing and Management},
  year     = {1978},
  volume   = {14},
  number   = {2},
  pages    = {57--69},
  url      = {},
  abstract = {},
}
@inproceedings{Karypis00,
  author    = {George Karypis and Eui-Hong Han},
  title     = {Fast Supervised Dimensionality Reduction Algorithm with
               Applications to Document Categorization and Retrieval},
  booktitle = {Proceedings of CIKM-00, 9th ACM International Conference on
               Information and Knowledge Management},
  editor    = {Arvin Agah and Jamie Callan and Elke Rundensteiner},
  publisher = {{ACM} Press, New York, {US}},
  address   = {McLean, {US}},
  year      = {2000},
  pages     = {12--19},
  url       = {ftp://ftp.cs.umn.edu/dept/users/kumar/cikm-ci.ps},
  abstract  = {Retrieval techniques based on dimensionality reduction, such as
               Latent Semantic Indexing (LSI), have been shown to improve the
               quality of the information being retrieved by capturing the
               latent meaning of the words present in the documents.
               Unfortunately, the high computational and memory requirements of
               LSI and its inability to compute an effective dimensionality
               reduction in a supervised setting limits its applicability. In
               this paper we present a fast supervised dimensionality reduction
               algorithm that is derived from the recently developed
               cluster-based unsupervised dimensionality reduction algorithms.
               We experimentally evaluate the quality of the lower dimensional
               spaces both in the context of document categorization and
               improvements in retrieval performance on a variety of different
               document collections. Our experiments show that the lower
               dimensional spaces computed by our algorithm consistently improve
               the performance of traditional algorithms such as C4.5,
               k-nearest-neighbor, and Support Vector Machines (SVM), by an
               average of 2\% to 7\%. Furthermore, the supervised lower
               dimensional space greatly improves the retrieval performance when
               compared to LSI.},
}
@inproceedings{Kawatani02,
  author    = {Takahiko Kawatani},
  title     = {Topic Difference Factor Extraction between Two Document Sets and
               its Application to Text Categorization},
  booktitle = {Proceedings of SIGIR-02, 25th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng
               and Kalervo J{\"{a}}rvelin},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Tampere, {FI}},
  year      = {2002},
  pages     = {137--144},
  url       = {http://doi.acm.org/10.1145/564376.564402},
  abstract  = {To improve performance in text categorization, it is important to
               extract distinctive features for each class. This paper proposes
               topic difference factor analysis (TDFA) as a method to extract
               projection axes that reflect topic differences between two
               document sets. Suppose all sentence vectors that compose each
               document are projected onto projection axes. TDFA obtains the
               axes that maximize the ratio between the document sets as to the
               sum of squared projections by solving a generalized eigenvalue
               problem. The axes are called topic difference factors (TDF's). By
               applying TDFA to the document set that belongs to a given class
               and a set of documents that is misclassified as belonging to that
               class by an existent classifier, we can obtain features that take
               large values in the given class but small ones in other classes,
               as well as features that take large values in other classes but
               small ones in the given class. A classifier was constructed
               applying the above features to complement the kNN classifier. As
               the results, the micro averaged F1 measure for Reuters-21578
               improved from 83.69 to 87.27\%.},
}
@inproceedings{Kessler97,
  author    = {Brett Kessler and Geoff Nunberg and Hinrich Sch{\"{u}}tze},
  title     = {Automatic detection of text genre},
  booktitle = {Proceedings of ACL-97, 35th Annual Meeting of the Association for
               Computational Linguistics},
  editor    = {Philip R. Cohen and Wolfgang Wahlster},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Madrid, {ES}},
  year      = {1997},
  pages     = {32--38},
  url       = {ftp://parcftp.xerox.com/pub/qca/genre/paper.acl97.ps.Z},
  abstract  = {As the text databases available to users become larger and more
               heterogeneous, genre becomes increasingly important for
               computational linguistics as a complement to topical and
               structural principles of classification. We propose a theory of
               genres as bundles of facets, which correlate with various surface
               cues, and argue that genre detection based on surface cues is as
               successful as detection based on deeper structural properties.},
}
@inproceedings{Khmelev03,
  author    = {Dmitry V. Khmelev and William J. Teahan},
  title     = {A repetition based measure for verification of text collections
               and for text categorization},
  booktitle = {Proceedings of SIGIR-03, 26th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Jamie Callan and Gordon Cormack and Charles Clarke and David
               Hawking and Alan Smeaton},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Toronto, {CA}},
  year      = {2003},
  pages     = {104--110},
  url       = {http://doi.acm.org/10.1145/860435.860456},
  abstract  = {We suggest a way for locating duplicates and plagiarisms in a
               text collection using an R-measure, which is the normalized sum
               of the lengths of all suffixes of the text repeated in other
               documents of the collection. The R-measure can be effectively
               computed using the suffix array data structure. Additionally, the
               computation procedure can be improved to locate the sets of
               duplicate or plagiarised documents. We applied the technique to
               several standard text collections and found that they contained a
               significant number of duplicate and plagiarised documents.
               Another reformulation of the method leads to an algorithm that
               can be applied to supervised multi-class categorization. We
               illustrate the approach using the recently available Reuters
               Corpus Volume 1 (RCV1). The results show that the method
               outperforms SVM at multi-class categorization, and interestingly,
               that results correlate strongly with compression-based methods.},
}
@inproceedings{Kim00,
  author    = {Yu-Hwan Kim and Shang-Yoon Hahn and Byoung-Tak Zhang},
  title     = {Text filtering by boosting naive {B}ayes classifiers},
  booktitle = {Proceedings of SIGIR-00, 23rd ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Athens, {GR}},
  year      = {2000},
  pages     = {168--175},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/345508/p168-kim/p168-kim.pdf},
  abstract  = {Several machine learning algorithms have recently been used for
               text categorization and filtering. In particular, boosting
               methods such as AdaBoost have shown good performance applied to
               real text data. However, most of existing boosting algorithms are
               based on classifiers that use binary-valued features. Thus, they
               do not fully make use of the weight information provided by
               standard term weighting methods. In this paper, we present a
               boosting-based learning method for text filtering that uses naive
               Bayes classifiers as a weak learner. The use of naive Bayes
               allows the boosting algorithm to utilize term frequency
               information while maintaining probabilistically accurate
               confidence ratio. Applied to TREC-7 and TREC-8 filtering track
               documents, the proposed method obtained a significant improvement
               in LF1, LF2, F1 and F3 measures compared to the best results
               submitted by other TREC entries.},
}
@inproceedings{Kindermann01,
  author    = {J{\"{o}}rg Kindermann and Gerhard Paa{\ss} and Edda Leopold},
  title     = {Error Correcting Codes with Optimized {K}ullback-{L}eibler
               Distances for Text Categorization},
  booktitle = {Proceedings of ECML-01, 12th European Conference on Machine
               Learning},
  editor    = {De Raedt, Luc and Siebes, Arno},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Freiburg, {DE}},
  year      = {2001},
  pages     = {266--275},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 2168},
  url       = {http://link.springer.de/link/service/series/0558/papers/2168/21680266.pdf},
  abstract  = {We extend a multi-class categorization scheme proposed by
               Dietterich and Bakiri 1995 for binary classifiers, using error
               correcting codes. The extension comprises the computation of the
               codes by a simulated annealing algorithm and optimization of
               Kullback-Leibler (KL) category distances within the code-words.
               For the first time, we apply the scheme to text categorization
               with support vector machines (SVMs) on several large text corpora
               with more than 100 categories. The results are compared to 1-of-N
               coding (i.e.\ one SVM for each text category). We also
               investigate codes with optimized KL distance between the text
               categories which are merged in the code-words. We find that error
               correcting codes perform better than 1-of-N coding with
               increasing code length. For very long codes, the performance is
               in some cases further improved by KL-distance optimization.},
}
@inproceedings{Klas00,
  author    = {Klas, Claus-Peter and Fuhr, Norbert},
  title     = {A new Effective Approach for Categorizing {W}eb Documents},
  booktitle = {Proceedings of BCSIRSG-00, the 22nd Annual Colloquium of the
               British Computer Society Information Retrieval Specialist Group},
  editor    = {},
  publisher = {},
  address   = {Cambridge, {UK}},
  year      = {2000},
  pages     = {},
  url       = {http://ls6-www.informatik.uni-dortmund.de/bib/fulltext/ir/Klas_Fuhr:00.ps.gz},
  abstract  = {Categorization of Web documents poses a new challenge for
               automatic classification methods. In this paper, we present the
               megadocument approach for categorization. For each category, all
               corresponding document texts from the training sample are
               concatenated to a megadocument, which is indexed using standard
               methods. In order to classify a new document, the most similar
               megadocument determines the category to be assigned. Our
               evaluations show that for Web collections, the megadocument
               method clearly outperforms other classification methods. In
               contrast, for the Reuters collection, we only achieve mediocre
               results. Thus, our method seems to be well suited for
               heterogeneous document collections.},
}
@article{Klingbiel73,
  author   = {Paul H. Klingbiel},
  title    = {Machine-aided indexing of technical literature},
  journal  = {Information Storage and Retrieval},
  year     = {1973},
  volume   = {9},
  number   = {2},
  pages    = {79--84},
  url      = {},
  abstract = {To index successfully in the Defense Documentation Center's
              environment, an automated system must choose single words or
              phrases (dependent upon context) rapidly and economically. The
              automation of DDC's indexing has been machine-aided from its
              inception. A machine-aided indexing (MAI) system is described
              that indexes one million words of text per hour of CPU time.
              Grammatical errors do not exceed five per cent of the output, so
              human screening is satisfactorily low. The system could
              potentially scale up to an operational size of 10 million words
              of text per YEAR - the equivalent of a dozen bibles or a third of
              the Encyclopedia Britannica. In a batch mode, the programs to
              accomplish this indexing would require no more than fifteen
              minutes of CPU time per week.},
}
@article{Klingbiel73a,
  author   = {Paul H. Klingbiel},
  title    = {A technique for machine-aided indexing},
  journal  = {Information Storage and Retrieval},
  year     = {1973},
  volume   = {9},
  number   = {9},
  pages    = {477--494},
  url      = {},
  abstract = {Subject indexing of text can, in principle, be accomplished in
              many ways. The technique for machine-aided indexing (MAI)
              developed at the Defense Documentation Center (DDC) is
              illustrated on a randomly chosen abstract. Additional text is
              provided in coded form so that the reader can more fully explore
              this technique and form his own opinion of the applicability and
              versatility of this particular procedure. The DDC method for
              subject indexing is very close to operational status for a data
              base which grows at the rate of two million words of text per
              YEAR.},
}
@inproceedings{Klinkenberg00,
  author    = {Ralf Klinkenberg and Thorsten Joachims},
  title     = {Detecting concept drift with support vector machines},
  booktitle = {Proceedings of ICML-00, 17th International Conference on Machine
               Learning},
  editor    = {Pat Langley},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Stanford, {US}},
  year      = {2000},
  pages     = {487--494},
  url       = {http://www-ai.cs.uni-dortmund.de/DOKUMENTE/klinkenberg_joachims_2000a.pdf.gz},
  abstract  = {For many learning tasks where data is collected over an extended
               period of time, its underlying distribution is likely to change.
               A typical example is information filtering, i.e. the adaptive
               classification of documents with respect to a particular user
               interest. Both the interest of the user and the document content
               change over time. A filtering system should be able to adapt to
               such concept changes. This paper proposes a new method to
               recognize and handle concept changes with support vector
               machines. The method maintains a window on the training data. The
               key idea is to automatically adjust the window size so that the
               estimated generalization error is minimized. The new approach is
               both theoretically well-founded as well as effective and
               efficient in practice. Since it does not require complicated
               parameterization, it is simpler to use and more robust than
               comparable heuristics. Experiments with simulated concept drift
               scenarios based on real-world text data compare the new method
               with other window management approaches. We show that it can
               effectively select an appropriate window size in a robust way.},
}
@inProceedings{Knorz82,
author = {Gerhard Knorz},
title = {A decision theory approach to optimal automated indexing},
booktitle = {Proceedings of SIGIR-82, 5th ACM International Conference on
Research and Development in Information Retrieval},
editor = {Salton, Gerard and Schneider, Hans-Jochen},
publisher = {Springer Verlag, Heidelberg, {DE}},
address = {Berlin, {DE}},
year = {1982},
pages = {174--193},
note = {Published in the ``Lecture Notes in Computer Science'' series,
number 146},
url = {},
abstract = {},
}
@inProceedings{Ko00,
author = {Ko, Youngjoong and Seo, Jungyun},
title = {Automatic Text Categorization by Unsupervised Learning},
booktitle = {Proceedings of COLING-00, the 18th International Conference on
Computational Linguistics},
editor = {},
address = {Saarbr{\"{u}}cken, {DE}},
year = {2000},
pages = {},
url = {http://nlp3.korea.ac.kr/proceeding/coling2000/COLING/ps/066.ps},
abstract = {The goal of text categorization is to classify documents into a
certain number of pre-defined categories. The previous works in
this area have used a large number of labeled training documents
for supervised learning. One problem is that it is difficult to
create the labeled training documents. While it is easy to
collect the unlabeled documents, it is not so easy to manually
categorize them for creating training documents. In this paper,
we propose an unsupervised learning method to overcome these
difficulties. The proposed method divides the documents into
sentences, and categorizes each sentence using keyword lists of
each category and sentence similarity measure. And then, it uses
the categorized sentences for training. The proposed method shows
a similar degree of performance, compared with the traditional
supervised learning methods. Therefore, this method can be used
in areas where low-cost text categorization is needed. It also
can be used for creating training documents.},
}
@inProceedings{Ko02,
author = {Youngjoong Ko and Jinwoo Park and Jungyun Seo},
title = {Automatic Text Categorization using the Importance of Sentences},
booktitle = {Proceedings of COLING-02, the 19th International Conference on
Computational Linguistics},
year = {2002},
editor = {},
pages = {},
address = {Taipei, {TW}},
url = {http://acl.ldc.upenn.edu/coling2002/proceedings/data/area-28/co-201.pdf},
abstract = {Automatic text categorization is a problem of automatically
assigning text documents to predefined categories. In order to
classify text documents, we must extract good features from them.
In previous research, a text document is commonly represented by
the term frequency and the inverted document frequency of each
feature. Since there is a difference between important sentences
and unimportant sentences in a document, the features from more
important sentences should be considered more than other
features. In this paper, we measure the importance of sentences
using text summarization techniques. Then a document is
represented as a vector of features with different weights
according to the importance of each sentence. To verify our new
method, we conducted experiments on two language newsgroup data
sets: one written by English and the other written by Korean.
Four kinds of classifiers were used in our experiments: Na{\"{\i}}ve
Bayes, Rocchio, k-NN, and SVM. We observed that our new method
made a significant improvement in all classifiers and both data
sets.},
}
@inProceedings{Ko02a,
author = {Youngjoong Ko and Jungyun Seo},
title = {Text Categorization using Feature Projections},
booktitle = {Proceedings of COLING-02, the 19th International Conference on
Computational Linguistics},
year = {2002},
editor = {},
pages = {},
address = {Taipei, {TW}},
url = {http://acl.ldc.upenn.edu/coling2002/proceedings/data/area-28/co-269.pdf},
abstract = {This paper proposes a new approach for text categorization, based
on a feature projection technique. In our approach, training data
are represented as the projections of training documents on each
feature. The voting for a classification is processed on the
basis of individual feature projections. The final classification
of test documents is determined by a majority voting from the
individual classifications of each feature. Our empirical results
show that the proposed approach, Text Categorization using
Feature Projections (TCFP), outperforms k-NN, Rocchio, and Na{\"{\i}}ve
Bayes. Most of all, TCFP is about one hundred times faster than
k-NN. Since TCFP algorithm is very simple, its implementation and
training process can be done very easily. For these reasons, TCFP
can be a useful classifier in the areas, which need a fast and
high-performance text categorization task.},
}
@inProceedings{Koehn02,
author = {Philipp Koehn},
title = {Combining Multiclass Maximum Entropy Text Classifiers with Neural
Network Voting},
booktitle = {Proceedings of PorTAL-02, 3rd International Conference on
Advances in Natural Language Processing},
year = {2002},
editor = {Elisabete Ranchhod and Nuno J. Mamede},
pages = {125--132},
address = {Faro, {PT}},
publisher = {Springer Verlag, Heidelberg, {DE}},
url = {http://link.springer.de/link/service/series/0558/papers/2389/23890125.pdf},
abstract = {We improve a high-accuracy maximum entropy classifier by
combining an ensemble of classifiers with neural network voting.
In our experiments we demonstrate significantly superior
performance both over a single classifier as well as over the use
of the traditional weighted-sum voting approach. Specifically, we
apply this to a maximum entropy classifier on a large scale
multi-class text categorization task: the online job directory
Flipdog with over half a million jobs in 65 categories.},
note = {Published in the ``Lecture Notes in Computer Science'' series,
number 2389},
}
@inProceedings{Kolcz01,
author = {Kolcz, Aleksander and Prabakarmurthi, Vidya and Kalita, Jugal K.},
title = {String Match and Text Extraction: Summarization as feature
selection for text categorization},
booktitle = {Proceedings of CIKM-01, 10th ACM International Conference on
Information and Knowledge Management},
editor = {Paques, Henrique and Liu, Ling and Grossman, David},
year = {2001},
address = {Atlanta, {US}},
publisher = {{ACM} Press, New York, {US}},
pages = {365--370},
url = {http://doi.acm.org/10.1145/502585.502647},
abstract = {We address the problem of evaluating the effectiveness of
summarization techniques for the task of document categorization.
It is argued that for a large class of automatic categorization
algorithms, extraction-based document categorization can be
viewed as a particular form of feature selection performed on the
full text of the document and, in this context, its impact can be
compared with state-of-the-art feature selection techniques
especially devised to provide good categorization performance.
Such a framework provides for a better assessment of the expected
performance of a categorizer if the compression rate of the
summarizer is known.},
}
@inProceedings{Koller97,
author = {Daphne Koller and Mehran Sahami},
title = {Hierarchically classifying documents using very few words},
booktitle = {Proceedings of ICML-97, 14th International Conference on Machine
Learning},
editor = {Douglas H. Fisher},
year = {1997},
address = {Nashville, {US}},
pages = {170--178},
publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
url = {http://robotics.stanford.edu/users/sahami/papers-dir/ml97-hier.ps},
abstract = {The proliferation of topic hierarchies for text documents has
resulted in a need for tools that automatically classify new
documents within such hierarchies. Existing classification
schemes which ignore the hierarchical structure and treat the
topics as separate classes are often inadequate in text
classification where there is a large number of classes and a
huge number of relevant features needed to distinguish between
them. We propose an approach that utilizes the hierarchical topic
structure to decompose the classification task into a set of
simpler problems, one at each node in the classification tree. As
we show, each of these smaller problems can be solved accurately
by focusing only on a very small set of features, those relevant
to the task at hand. This set of relevant features varies widely
throughout the hierarchy, so that, while the overall relevant
feature set may be large, each classifier only examines a small
subset. The use of reduced feature sets allows us to utilize more
complex (probabilistic) models, without encountering many of the
standard computational and robustness difficulties.},
}
@inProceedings{Kongovi02,
author = {Kongovi, Madhusudhan and Guzman, Juan Carlos and Dasigi, Venu},
title = {Text Categorization: An experiment using Phrases},
booktitle = {Proceedings of ECIR-02, 24th European Colloquium on Information
Retrieval Research},
editor = {Fabio Crestani and Mark Girolami and Cornelis J. Van Rijsbergen},
publisher = {Springer Verlag, Heidelberg, {DE}},
address = {Glasgow, {UK}},
year = {2002},
pages = {213--228},
note = {Published in the ``Lecture Notes in Computer Science'' series,
number 2291},
url = {http://link.springer.de/link/service/series/0558/papers/2291/22910213.pdf},
abstract = {Typical text classifiers learn from example and training
documents that have been manually categorized. In this research,
our experiment dealt with the classification of news wire
articles using category profiles. We built these profiles by
selecting feature words and phrases from the training documents.
For our experiments we decided on using the text corpus
Reuters-21578. We used precision and recall to measure the
effectiveness of our classifier. Though our experiments with
words yielded good results, we found instances where the
phrase-based approach produced more effectiveness. This could be
due to the fact that when a word along with its adjoining word -
a phrase - is considered towards building a category profile, it
could be a good discriminator. This tight packaging of word pairs
could bring in some semantic value. The packing of word pairs
also filters out words occurring frequently in isolation that do
not bear much weight towards characterizing that category.},
}
@article{Koppel02,
author = {Koppel, Moshe and Argamon, Shlomo and Shimoni, Anat R.},
title = {Automatically categorizing written texts by author gender},
journal = {Literary and Linguistic Computing},
year = {2002},
number = {4},
volume = {17},
pages = {401--412},
url = {http://www3.oup.co.uk/litlin/hdb/Volume_17/Issue_04/pdf/170401.pdf},
abstract = {The problem of automatically determining the gender of a
document's author would appear to be a more subtle problem than
those of categorization by topic or authorship attribution.
Nevertheless, it is shown that automated text categorization
techniques can exploit combinations of simple lexical and
syntactic features to infer the gender of the author of an unseen
formal written document with approximately 80 per cent accuracy.
The same techniques can be used to determine if a document is
fiction or non-fiction with approximately 98 per cent accuracy.},
}
@inProceedings{Kosmynin96,
author = {Arkadi Kosmynin and Ian Davidson},
title = {Using background contextual knowledge for documents
representation},
booktitle = {Proceedings of PODP-96, 3rd International Workshop on Principles
of Document Processing},
editor = {Charles K. Nicholas and Derick Wood},
year = {1996},
address = {Palo Alto, {US}},
pages = {123--133},
publisher = {Springer Verlag, Heidelberg, {DE}},
note = {Published in the ``Lecture Notes in Computer Science'' series,
number 1293},
url = {},
abstract = {We describe our approach to document representation that captures
contextual dependencies between terms in a corpus and makes use
of these dependencies to represent documents. We have tried our
representation scheme for automatic document categorisation on
the Reuters' test set of documents. We achieve a precision recall
break even point of 84\% which is comparable to the best known
published results. Our approach acts as a feature selection
technique that is an alternative to applying the techniques from
machine learning and numerical taxonomy.},
}
@article{Krier02,
author = {Krier, Marc and Zacc{\`a}, Francesco},
title = {Automatic categorization applications at the {E}uropean {P}atent
{O}ffice},
journal = {World Patent Information},
volume = {24},
number = {},
year = {2002},
pages = {187--196},
url = {},
abstract = {},
}
@inProceedings{Krishnapuram03,
author = {Raghu Krishnapuram and Krishna Chitrapura and Sachindra Joshi},
title = {Classification of Text Documents Based on Minimum System Entropy},
booktitle = {Proceedings of ICML-03, 20th International Conference on Machine
Learning},
editor = {},
year = {2003},
address = {Washington, {US}},
pages = {},
publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
url = {},
abstract = {},
}
@inProceedings{Kwok98,
author = {James T. Kwok},
title = {Automated text categorization using support vector machine},
booktitle = {Proceedings of ICONIP'98, 5th International Conference on Neural
Information Processing},
editor = {},
year = {1998},
address = {Kitakyushu, {JP}},
pages = {347--351},
url = {http://www.comp.hkbu.edu.hk/~jamesk/papers/iconip98.ps.gz},
abstract = {In this paper, we study the use of support vector machine in text
categorization. Unlike other machine learning techniques, it
allows easy incorporation of new documents into an existing
trained system. Moreover, dimension reduction, which is usually
imperative, now becomes optional. Thus, SVM adapts efficiently in
dynamic environments that require frequent additions to the
document collection. Empirical results on the Reuters-22173
collection are also discussed.},
}
@inProceedings{Kwon99,
author = {Kwon, Oh-Woog and Jung, Sung-Hwa and Lee, Jong-Hyeok and Lee, Geunbae},
title = {Evaluation of Category Features and Text Structural Information
on a Text Categorization Using Memory Based Reasoning},
booktitle = {Proceedings of ICCPOL-99, 18th International Conference on
Computer Processing of Oriental Languages},
editor = {},
address = {Tokushima, {JP}},
year = {1999},
pages = {153--158},
url = {},
abstract = {},
}
@article{Kwon03,
author = {Kwon, Oh-Woog and Lee, Jong-Hyeok},
title = {Text categorization based on {k}-nearest neighbor approach for
{W}eb site classification},
journal = {Information Processing and Management},
volume = {39},
number = {1},
year = {2003},
pages = {25--44},
url = {},
abstract = {},
}
@inProceedings{Labrou99,
author = {Yannis Labrou and Tim Finin},
title = {{{\sc Yahoo!}} as an ontology: using {{\sc Yahoo!}}\ categories
to describe documents},
booktitle = {Proceedings of CIKM-99, 8th ACM International Conference on
Information and Knowledge Management},
publisher = {{ACM} Press, New York, {US}},
editor = {},
year = {1999},
address = {Kansas City, {US}},
pages = {180--187},
url = {http://www.acm.org/pubs/articles/proceedings/cikm/319950/p180-labrou/p180-labrou.pdf},
abstract = {We suggest that one (or a collection) of names of {{\sc Yahoo!}}\
(or any other WWW indexer's) categories can be used to describe
the content of a document. Such categories offer a standardized
and universal way for referring to or describing the nature of
real world objects, activities, documents and so on, and may be
used (we suggest) to semantically characterize the content of
documents. WWW indices, like {{\sc Yahoo!}}\ provide a huge
hierarchy of categories (topics) that touch every aspect of human
endeavors. Such topics can be used as descriptors, similarly to
the way librarians use for example, the Library of Congress
cataloging system to annotate and categorize books. In the course
of investigating this idea, we address the problem of automatic
categorization of webpages in the {{\sc Yahoo!}}\ directory. We
use Telltale as our classifier; Telltale uses n-grams to compute
the similarity between documents. We experiment with various
types of descriptions for the {{\sc Yahoo!}}\ categories and the
webpages to be categorized. Our findings suggest that the best
results occur when using the very brief descriptions of the {{\sc
Yahoo!}}\ categorized entries; these brief descriptions are
provided either by the entries' submitters or by the {{\sc
Yahoo!}}\ human indexers and accompany most {{\sc
Yahoo!}}-indexed entries.},
}
@inProceedings{Lai01,
author = {Kwok-Yin Lai and Wai Lam},
title = {Meta-learning Models for Automatic Textual Document
Categorization},
booktitle = {Proceedings of PAKDD-01, 5th Pacific-Asia Conference on
Knowledge Discovery and Data Mining},
editor = {David Cheung and Qing Li and Graham Williams},
year = {2001},
publisher = {Springer Verlag, Heidelberg, {DE}},
address = {Hong Kong, {CN}},
note = {Published in the ``Lecture Notes in Computer Science'' series,
number 2035},
pages = {78--89},
url = {http://link.springer.de/link/service/series/0558/papers/2035/20350078.pdf},
abstract = {We investigate two meta-model approaches for the task of
automatic textual document categorization. The first approach is
the linear combination approach. Based on the idea of distilling
the characteristics of how we estimate the merits of each
component algorithm, we propose three different strategies for
the linear combination approach. The linear combination approach
makes use of limited knowledge in the training document set. To
address this limitation, we propose the second meta-model
approach, called Meta-learning Using Document Feature
characteristics (MUDOF), which employs a meta-learning phase
using document feature characteristics. Document feature
characteristics, derived from the training document set, capture
some inherent properties of a particular category. Extensive
experiments have been conducted on a real-world document
collection and satisfactory performance is obtained.},
}
@article{Lai02,
author = {Lai, Yu-Sheng and Wu, Chung-Hsien},
title = {COLUMN: Meaningful term extraction and discriminative term
selection in text categorization via unknown-word methodology},
journal = {{ACM} Transactions on Asian Language Information Processing},
year = {2002},
volume = {1},
number = {1},
pages = {34--64},
url = {http://doi.acm.org/10.1145/509900.509904},
abstract = {In this article, an approach based on unknown words is proposed
for meaningful term extraction and discriminative term selection
in text categorization. For meaningful term extraction, a
phrase-like unit (PLU)-based likelihood ratio is proposed to
estimate the likelihood that a word sequence is an unknown word.
On the other hand, a discriminative measure is proposed for term
selection and is combined with the PLU-based likelihood ratio to
determine the text category. We conducted several experiments on
a news corpus, called MSDN. The MSDN corpus is collected from an
online news Website maintained by the Min-Sheng Daily News,
Taiwan. The corpus contains 44,675 articles with over 35 million
words. The experimental results show that the system using a
simple classifier achieved 95.31\% accuracy. When using a
state-of-the-art classifier, kNN, the average accuracy is
96.40\%, outperforming all the other systems evaluated on the
same collection, including the traditional term-word by kNN
(88.52\%); sleeping-experts (82.22\%); sparse phrase by four-word
sleeping-experts (86.34\%); and Boolean combinations of words by
RIPPER (87.54\%). A proposed purification process can effectively
reduce the dimensionality of the feature space from 50,576 terms
in the word-based approach to 19,865 terms in the unknown
word-based approach. In addition, more than 80\% of automatically
extracted terms are meaningful. Experiments also show that the
proportion of meaningful terms extracted from training data is
relative to the classification accuracy in outside testing.},
}
@inProceedings{Lam99,
author = {Savio L. Lam and Dik L. Lee},
title = {Feature Reduction for Neural Network Based Text Categorization},
booktitle = {Proceedings of DASFAA-99, 6th IEEE International Conference on
Database Systems for Advanced Applications},
editor = {Arbee L. Chen and Frederick H. Lochovsky},
publisher = {{IEEE} Computer Society Press, Los Alamitos, {US}},
year = {1999},
address = {Hsinchu, {TW}},
pages = {195--202},
url = {http://dlib.computer.org/conferen/dasfaa/0084/pdf/00840195.pdf},
abstract = {In a text categorization model using an artificial neural network
as the text classifier scalability is poor if the neural network
is trained using the raw feature space since textual data has a
very high-dimension feature space. We proposed and compared four
dimensionality reduction techniques to reduce the feature space
into an input space of much lower dimension for the neural
network classifier. To test the effectiveness of the proposed
model, experiments were conducted using a subset of the
Reuters-22173 test collection for text categorization. The
results showed that the proposed model was able to achieve high
categorization effectiveness as measured by precision and recall.
Among the four dimensionality reduction techniques proposed,
principal component analysis was found to be the most effective
in reducing the dimensionality of the feature space.},
}
@inProceedings{Lam97,
author = {Lam, Wai and Low, Kon F. and Ho, Chao Y.},
title = {Using a {B}ayesian Network Induction Approach for Text
Categorization},
booktitle = {Proceedings of IJCAI-97, 15th International Joint Conference on
Artificial Intelligence},
editor = {Pollack, Martha E.},
year = {1997},
address = {Nagoya, {JP}},
publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
pages = {745--750},
url = {},
abstract = {We investigate Bayesian methods for automatic document
categorization and develop a new approach to this problem. Our
new approach is based on a Bayesian network induction which does
not rely on some major assumptions found in a previous method
using the Bayesian independence classifier approach. The design
of the new approach as well as its justification are presented.
Experiments were conducted using a large scale document
collection from Reuters news articles. The results show that our
approach outperformed the Bayesian independence classifier as
measured by a metric that combines precision and recall measures.},
}
@inProceedings{Lam98,
author = {Wai Lam and Chao Y. Ho},
title = {Using a generalized instance set for automatic text
categorization},
booktitle = {Proceedings of SIGIR-98, 21st ACM International Conference on
Research and Development in Information Retrieval},
editor = {W. Bruce Croft and Alistair Moffat and Cornelis J. Van Rijsbergen
and Ross Wilkinson and Justin Zobel},
publisher = {{ACM} Press, New York, {US}},
year = {1998},
address = {Melbourne, {AU}},
pages = {81--89},
url = {http://www.acm.org/pubs/articles/proceedings/ir/290941/p81-lam/p81-lam.pdf},
abstract = {We investigate several recent approaches for text categorization
under the framework of similarity-based learning. They include
two families of text categorization techniques, namely the
k-nearest neighbor (k-NN) algorithm and linear classifiers. After
identifying the weakness and strength of each technique, we
propose a new technique known as the generalized instance set
(GIS) algorithm by unifying the strengths of k-NN and linear
classifiers and adapting to characteristics of text
categorization problems. We also explore some variants of our GIS
approach. We have implemented our GIS algorithm, the ExpNet
algorithm, and some linear classifiers. Extensive experiments
have been conducted on two common document corpora, namely the
OHSUMED collection and the Reuters-21578 collection. The results
show that our new approach outperforms the latest k-NN approach
and linear classifiers in all experiments.},
}
@article{Lam99a,
author = {Wai Lam and Miguel E. Ruiz and Padmini Srinivasan},
title = {Automatic text categorization and its applications to text
retrieval},
journal = {{IEEE} Transactions on Knowledge and Data Engineering},
volume = {11},
number = {6},
year = {1999},
pages = {865--879},
url = {http://www.cs.uiowa.edu/~mruiz/papers/IEEE-TKDE.ps},
abstract = {We develop an automatic text categorization approach and
investigate its application to text retrieval. The categorization
approach is derived from a combination of a learning paradigm
known as instance-based learning and an advanced document
retrieval technique known as retrieval feedback. We demonstrate
the effectiveness of our categorization approach using two
real-world document collections from the MEDLINE database. Next,
we investigate the application of automatic categorization to
text retrieval. Our experiments clearly indicate that automatic
categorization improves the retrieval performance compared with
no categorization. We also demonstrate that the retrieval
performance using automatic categorization achieves the same
retrieval quality as the performance using manual categorization.
Furthermore, detailed analysis of the retrieval performance on
each individual test query is provided.},
}
@inProceedings{Lam01,
author = {Lam, Wai and Lai, Kwok-Yin},
title = {A Meta-Learning Approach for Text Categorization},
booktitle = {Proceedings of SIGIR-01, 24th ACM International Conference on
Research and Development in Information Retrieval},
editor = {W. Bruce Croft and David J. Harper and Donald H. Kraft and Justin
Zobel},
year = {2001},
address = {New Orleans, {US}},
publisher = {{ACM} Press, New York, {US}},
pages = {303--309},
url = {http://portal.acm.org/citation.cfm?doid=383952.384011},
abstract = {We investigate a meta-model approach, called Meta-learning Using
Document Feature characteristics (MUDOF), for the task of
automatic textual document categorization. It employs a
meta-learning phase using document feature characteristics.
Document feature characteristics, derived from the training
document set, capture some inherent category-specific properties
of a particular category. Different from existing categorization
methods, MUDOF can automatically recommend a suitable algorithm
for each category based on the category-specific statistical
characteristics. Hence, different algorithms may be employed for
different categories. Experiments have been conducted on a
real-world document collection demonstrating the effectiveness of
our approach. The results confirm that our meta-model approach
can exploit the advantage of its component algorithms, and
demonstrate a better performance than existing algorithms.},
}
@inProceedings{Lang95,
author = {Lang, Ken},
title = {{\sc NewsWeeder}: learning to filter netnews},
booktitle = {Proceedings of ICML-95, 12th International Conference on Machine
Learning},
editor = {Prieditis, Armand and Russell, Stuart J.},
publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
address = {Lake Tahoe, {US}},
year = {1995},
pages = {331--339},
url = {},
abstract = {},
}
@inProceedings{Lanquillon00,
author = {Carsten Lanquillon},
title = {Learning from Labeled and Unlabeled Documents: A Comparative
Study on Semi-Supervised Text Classification},
booktitle = {Proceedings of PKDD-00, 4th European Conference on Principles of
Data Mining and Knowledge Discovery},
editor = {Djamel A. Zighed and Henryk Jan Komorowski and Jan M. Zytkow},
address = {Lyon, {FR}},
pages = {490--497},
year = {2000},
publisher = {Springer Verlag, Heidelberg, {DE}},
note = {Published in the ``Lecture Notes in Computer Science'' series,
number 1910},
url = {http://link.springer.de/link/service/series/0558/papers/1910/19100490.pdf},
abstract = {Supervised learning algorithms usually require large amounts of
training data to learn reasonably accurate classifiers. Yet, for
many text classification tasks, providing labeled training
documents is expensive, while unlabeled documents are readily
available in large quantities. Learning from both, labeled and
unlabeled documents, in a semi-supervised framework is a
promising approach to reduce the need for labeled training
documents. This paper compares three commonly applied text
classifiers in the light of semi-supervised learning, namely a
linear support vector machine, a similarity-based tfidf and a
Na{\"{\i}}ve Bayes classifier. Results on a real-world text datasets
show that these learners may substantially benefit from using a
large amount of unlabeled documents in addition to some labeled
documents.},
}
@inProceedings{Larkey96,
author = {Larkey, Leah S. and Croft, W. Bruce},
title = {Combining classifiers in text categorization},
booktitle = {Proceedings of SIGIR-96, 19th ACM International Conference on
Research and Development in Information Retrieval},
editor = {Hans-Peter Frei and Donna Harman and Peter Sch{\"{a}}uble and
Ross Wilkinson},
year = {1996},
address = {Z{\"{u}}rich, {CH}},
publisher = {{ACM} Press, New York, {US}},
pages = {289--297},
url = {http://cobar.cs.umass.edu/pubfiles/1combo.ps.gz},
abstract = {Three different types of classifiers were investigated in the
context of a text categorization problem in the medical domain:
the automatic assignment of ICD9 codes to dictated inpatient
discharge summaries. K-nearest-neighbour, relevance feedback, and
Bayesian independence classifiers were applied individually and
in combination. A combination of different classifiers produced
better results than any single type of classifier. For this
specific medical categorization problem, new query formulation
and weighting methods used in the k-nearest-neighbor classifier
improved performance.},
}
@inProceedings{Larkey98,
author = {Larkey, Leah S.},
title = {Automatic essay grading using text categorization techniques},
booktitle = {Proceedings of SIGIR-98, 21st ACM International Conference on
Research and Development in Information Retrieval},
editor = {W. Bruce Croft and Alistair Moffat and Cornelis J. Van Rijsbergen
and Ross Wilkinson and Justin Zobel},
year = {1998},
address = {Melbourne, {AU}},
publisher = {{ACM} Press, New York, {US}},
pages = {90--95},
url = {http://cobar.cs.umass.edu/pubfiles/ir-121.ps},
abstract = {Several standard text-categorization techniques were applied to
the problem of automated essay grading. Bayesian independence
classifiers and k-nearest-neighbor classifiers were trained to
assign scores to manually-graded essays. These scores were
combined with several other summary text measures using linear
regression. The classifiers and regression equations were then
applied to a new set of essays. The classifiers worked very well.
The agreement between the automated grader and the final manual
grade was as good as the agreement between human graders.},
}
@inProceedings{Larkey99,
author = {Leah S. Larkey},
title = {A patent search and classification system},
booktitle = {Proceedings of DL-99, 4th ACM Conference on Digital Libraries},
editor = {Edward A. Fox and Neil Rowe},
publisher = {{ACM} Press, New York, {US}},
year = {1999},
address = {Berkeley, {US}},
pages = {179--187},
url = {http://cobar.cs.umass.edu/pubfiles/ir-162.ps},
abstract = {We present a system for searching and classifying U.S. patent
documents, based on Inquery. Patents are distributed through
hundreds of collections, divided up by general area. The system
selects the best collections for the query. Users can search for
patents or classify patent text. The user interface helps users
search in fields without requiring the knowledge of Inquery query
operators. The system includes a unique phrase help facility,
which helps users find and add phrases and terms related to those
in their query.},
}
@inproceedings{Lee02,
  author    = {Yong-Bae Lee and Sung H. Myaeng},
  title     = {Text Genre Classification with Genre-Revealing and
    Subject-Revealing Features},
  booktitle = {Proceedings of SIGIR-02, 25th ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng
    and Kalervo J{\"{a}}rvelin},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Tampere, {FI}},
  year      = {2002},
  pages     = {145--150},
  url       = {http://doi.acm.org/10.1145/564376.564403},
  abstract  = {Subject or propositional content has been the focus of most
    classification research. Genre or style, on the other hand, is a
    different and important property of text, and automatic text
    genre classification is becoming important for classification and
    retrieval purposes as well as for some natural language
    processing research. In this paper, we present a method for
    automatic genre classification that is based on statistically
    selected features obtained from both subject-classified and
    genre-classified training data. The experimental results show that
    the proposed method outperforms a direct application of a statistical
    learner often used for subject classification. We also observe
    that the deviation formula and discrimination formula using
    document frequency ratios also work as expected. We conjecture
    that this dual feature set approach can be generalized to improve
    the performance of subject classification as well.},
}
@inproceedings{Lee02a,
  author    = {Michael D. Lee},
  title     = {Fast Text Classification Using Sequential Sampling Processes},
  booktitle = {Proceedings of the 14th Australian Joint Conference on Artificial
    Intelligence},
  editor    = {Markus Stumptner and Dan Corbett and Michael J. Brooks},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Adelaide, {AU}},
  year      = {2002},
  pages     = {309--320},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
    number 2256},
  url       = {http://link.springer.de/link/service/series/0558/papers/2256/22560309.pdf},
  abstract  = {A central problem in information retrieval is the automated
    classification of text documents. While many existing methods
    achieve good levels of performance, they generally require levels
    of computation that prevent them from making sufficiently fast
    decisions in some applied setting. Using insights gained from
    examining the way humans make fast decisions when classifying
    text documents, two new text classification algorithms are
    developed based on sequential sampling processes. These
    algorithms make extremely fast decisions, because they need to
    examine only a small number of words in each text document.
    Evaluation against the Reuters-21578 collection shows both
    techniques have levels of performance that approach benchmark
    methods, and the ability of one of the classifiers to produce
    realistic measures of confidence in its decisions is shown to be
    useful for prioritizing relevant documents.},
}
@inproceedings{Lee02c,
  author    = {Kang Hyuk Lee and Judy Kay and Byeong Ho Kang and Uwe Rosebrock},
  title     = {A Comparative Study on Statistical Machine Learning Algorithms
    and Thresholding Strategies for Automatic Text Categorization},
  booktitle = {Proceedings of PRICAI-02, 7th Pacific Rim International
    Conference on Artificial Intelligence},
  editor    = {Mitsuru Ishizuka and Abdul Sattar},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Tokyo, {JP}},
  year      = {2002},
  pages     = {444--453},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
    number 2417},
  url       = {http://link.springer.de/link/service/series/0558/papers/2417/24170444.pdf},
  abstract  = {Two main research areas in statistical text categorization are
    similarity-based learning algorithms and associated thresholding
    strategies. The combination of these techniques significantly
    influences the overall performance of text categorization. After
    investigating two similarity-based classifiers (k-NN and Rocchio)
    and three common thresholding techniques (RCut, PCut, and SCut),
    we describe a new learning algorithm known as the keyword
    association network (KAN) and a new thresholding strategy
    (RinSCut) to improve performance over existing techniques.
    Extensive experiments have been conducted on the Reuters-21578
    and 20-Newsgroups data sets. The experimental results show that
    our new approaches give better results for both micro-averaged F1
    and macro-averaged F1 scores.},
}
@article{Lehnert94,
  author    = {Wendy Lehnert and Stephen Soderland and David Aronow and Fangfang
    Feng and Avinoam Shmueli},
  title     = {Inductive text classification for medical applications},
  journal   = {Journal of Experimental and Theoretical Artificial Intelligence},
  year      = {1994},
  volume    = {7},
  number    = {1},
  pages     = {49--80},
  url       = {},
  abstract  = {},
}
@article{Leopold02,
  author    = {Leopold, Edda and Kindermann, J{\"{o}}rg},
  title     = {Text Categorization with Support Vector Machines: How to
    Represent Texts in Input Space?},
  journal   = {Machine Learning},
  year      = {2002},
  volume    = {46},
  number    = {1/3},
  pages     = {423--444},
  url       = {http://www.wkap.nl/article.pdf?380516},
  abstract  = {The choice of the kernel function is crucial to most applications
    of support vector machines. In this paper, however, we show that
    in the case of text classification, term-frequency
    transformations have a larger impact on the performance of SVM
    than the kernel itself. We discuss the role of importance-weights
    (e.g. document frequency and redundancy), which is not yet fully
    understood in the light of model complexity and calculation cost,
    and we show that time consuming lemmatization or stemming can be
    avoided even when classifying a highly inflectional language like
    German.},
}
@article{Leung97,
  author    = {Chi-Hong Leung and Wing-Kay Kan},
  title     = {A Statistical Learning Approach to Automatic Indexing of
    Controlled Index Terms},
  journal   = {Journal of the American Society for Information Science},
  year      = {1997},
  volume    = {48},
  number    = {1},
  pages     = {55--67},
  url       = {http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=39602&PLACEBO=IE.pdf},
  abstract  = {A statistical learning approach to assigning controlled index
    terms is presented. In this approach, there are two processes:
    (1) The learning process and (2) the indexing process. The
    learning process constructs a relationship between an index term
    and the words relevant and irrelevant to it, based on the
    positive training set and negative training set, which are sample
    documents indexed by the index term, and those not indexed by it,
    respectively. The indexing process determines whether an index
    term is assigned to a certain document, based on the relationship
    constructed by the learning process, and the text found in the
    document. Furthermore, a learning feedback technique is
    introduced. This technique used in the learning process modifies
    the relationship between an index term and its relevant and
    irrelevant words to improve the learning performance and, thus,
    the indexing performance. Experimental results have shown that
    the statistical learning approach and the learning feedback
    technique are practical means to automatic indexing of controlled
    index terms.},
}
@inproceedings{Lewis91,
  author    = {Lewis, David D.},
  title     = {Data extraction as text categorization: An experiment with the
    {MUC-3} corpus},
  booktitle = {Proceedings of MUC-3, 3rd Message Understanding Conference},
  editor    = {},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {San Diego, {US}},
  year      = {1991},
  pages     = {245--255},
  url       = {http://www.research.att.com/~lewis/papers/lewis91c.ps},
  abstract  = {[no abstract]},
}
@inproceedings{Lewis92,
  author    = {Lewis, David D.},
  title     = {An evaluation of phrasal and clustered representations on a text
    categorization task},
  booktitle = {Proceedings of SIGIR-92, 15th ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {Nicholas J. Belkin and Peter Ingwersen and Annelise Mark
    Pejtersen},
  publisher = {{ACM} Press, New York, {US}},
  address   = {K{\o}benhavn, {DK}},
  year      = {1992},
  pages     = {37--50},
  url       = {http://www.research.att.com/~lewis/papers/lewis92b.ps},
  abstract  = {Syntactic phrase indexing and term clustering have been widely
    explored as text representation techniques for text retrieval. In
    this paper, we study the properties of phrasal and clustered
    indexing languages on a text categorization task, enabling us to
    study their properties in isolation from query interpretation
    issues. We show that optimal effectiveness occurs when using only
    a small proportion of the indexing terms available, and that
    effectiveness peaks at a higher feature set size and lower
    effectiveness level for a syntactic phrase indexing than for
    word-based indexing. We also present results suggesting that
    traditional term clustering methods are unlikely to provide
    significantly improved text representations. An improved
    probabilistic text categorization method is also presented.},
}
@phdthesis{Lewis92a,
  author    = {Lewis, David D.},
  title     = {Representation and learning in information retrieval},
  school    = {Department of Computer Science, University of Massachusetts},
  address   = {Amherst, {US}},
  year      = {1992},
  url       = {http://www.research.att.com/~lewis/papers/lewis91d.ps},
  abstract  = {This dissertation introduces a new theoretical model for text
    classification systems, including systems for document retrieval,
    automated indexing, electronic mail filtering, and similar tasks.
    The Concept Learning model emphasizes the role of manual and
    automated feature selection and classifier formation in text
    classification. It enables drawing on results from statistics and
    machine learning in explaining the effectiveness of alternate
    representations of text, and specifies desirable characteristics
    of text representations. The use of syntactic parsing to produce
    indexing phrases has been widely investigated as a possible route
    to better text representations. Experiments with syntactic phrase
    indexing, however, have never yielded significant improvements in
    text retrieval performance. The Concept Learning model suggests
    that the poor statistical characteristics of a syntactic indexing
    phrase representation negate its desirable semantic
    characteristics. The application of term clustering to this
    representation to improve its statistical properties while
    retaining its desirable meaning properties is proposed. Standard
    term clustering strategies from information retrieval (IR), based
    on cooccurrence of indexing terms in documents or groups of
    documents, were tested on a syntactic indexing phrase
    representation. In experiments using a standard text retrieval
    test collection, small effectiveness improvements were obtained.
    As a means of evaluating representation quality, a text retrieval
    test collection introduces a number of confounding factors. In
    contrast, the text categorization task allows much cleaner
    determination of text representation properties. In preparation
    for the use of text categorization to study text representation,
    a more effective and theoretically well-founded probabilistic text
    categorization algorithm was developed, building on work by
    Maron, Fuhr, and others. Text categorization experiments
    supported a number of predictions of the Concept Learning model
    about properties of phrasal representations, including
    dimensionality properties not previously measured for text
    representations. However, in carefully controlled experiments
    using syntactic phrases produced by Church's stochastic
    bracketer, in conjunction with reciprocal nearest neighbor
    clustering, term clustering was found to produce essentially no
    improvement in the properties of the phrasal representation. New
    cluster analysis approaches are proposed to remedy the problems
    found in traditional term clustering methods.},
}
@inproceedings{Lewis94c,
  author    = {Lewis, David D. and Jason Catlett},
  title     = {Heterogeneous uncertainty sampling for supervised learning},
  booktitle = {Proceedings of ICML-94, 11th International Conference on Machine
    Learning},
  editor    = {William W. Cohen and Haym Hirsh},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {New Brunswick, {US}},
  year      = {1994},
  pages     = {148--156},
  url       = {http://www.research.att.com/~lewis/papers/lewis94e.ps},
  abstract  = {Uncertainty sampling methods iteratively request class labels for
    training instances whose classes are uncertain despite the
    previous labeled instances. These methods can greatly reduce the
    number of instances that an expert need label. One problem with
    this approach is that the classifier best suited for an
    application may be too expensive to train or use during the
    selection of instances. We test the use of one classifier (a
    highly efficient probabilistic one) to select examples for
    training another (the C4.5 rule induction program). Despite being
    chosen by this heterogeneous approach, the uncertainty samples
    yielded classifiers with lower error rates than random samples
    ten times larger.},
}
@inproceedings{Lewis94a,
  author    = {Lewis, David D. and Gale, William A.},
  title     = {A sequential algorithm for training text classifiers},
  booktitle = {Proceedings of SIGIR-94, 17th ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {W. Bruce Croft and Cornelis J. Van Rijsbergen},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Dublin, {IE}},
  year      = {1994},
  pages     = {3--12},
  note      = {See also~\cite{Lewis95a}},
  url       = {http://www.research.att.com/~lewis/papers/lewis94c.ps},
  abstract  = {The ability to cheaply train text classifiers is critical to
    their use in information retrieval, content analysis, natural
    language processing, and other tasks involving data which is
    partly or fully textual. An algorithm for sequential sampling
    during machine learning of statistical classifiers was developed
    and tested on a newswire text categorization task. This method,
    which we call uncertainty sampling, reduced by as much as
    500-fold the amount of training data that would have to be
    manually classified to achieve a given level of effectiveness.},
}
@article{Lewis94b,
  author    = {Lewis, David D. and Philip J. Hayes},
  title     = {Guest editors' introduction to the special issue on text
    categorization},
  journal   = {{ACM} Transactions on Information Systems},
  year      = {1994},
  volume    = {12},
  number    = {3},
  pages     = {231},
}
@inproceedings{Lewis94,
  author    = {Lewis, David D. and Marc Ringuette},
  title     = {A comparison of two learning algorithms for text categorization},
  booktitle = {Proceedings of SDAIR-94, 3rd Annual Symposium on Document
    Analysis and Information Retrieval},
  editor    = {},
  publisher = {},
  address   = {Las Vegas, {US}},
  year      = {1994},
  pages     = {81--93},
  url       = {http://www.research.att.com/~lewis/papers/lewis94b.ps},
  abstract  = {This paper examines the use of inductive learning to categorize
    natural language documents into predefined content categories.
    Categorization of text is of increasing importance in information
    retrieval and natural language processing systems. Previous
    research on automated text categorization has mixed machine
    learning and knowledge engineering methods, making it difficult
    to draw conclusions about the performance of particular methods.
    In this paper we present empirical results on the performance of
    a Bayesian classifier and a decision tree learning algorithm on
    two text categorization data sets. We find that both algorithms
    achieve reasonable performance and allow controlled tradeoffs
    between false positives and false negatives. The stepwise feature
    selection in the decision tree algorithm is particularly
    effective in dealing with the large feature sets common in text
    categorization. However, even this algorithm is aided by an
    initial prefiltering of features, confirming the results found by
    Almuallim and Dietterich on artificial data sets. We also
    demonstrate the impact of the time-varying nature of category
    definitions.},
}
@inproceedings{Lewis95,
  author    = {Lewis, David D.},
  title     = {Evaluating and optimizing autonomous text classification systems},
  booktitle = {Proceedings of SIGIR-95, 18th ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {Edward A. Fox and Peter Ingwersen and Raya Fidel},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Seattle, {US}},
  year      = {1995},
  pages     = {246--254},
  url       = {http://www.research.att.com/~lewis/papers/lewis95b.ps},
  abstract  = {Text retrieval systems typically produce a ranking of documents
    and let a user decide how far down that ranking to go. In
    contrast, programs that filter text streams, software that
    categorizes documents, agents which alert users, and many other
    IR systems must make decisions without human input or
    supervision. It is important to define what constitutes good
    effectiveness for these autonomous systems, tune the systems to
    achieve the highest possible effectiveness, and estimate how the
    effectiveness changes as new data is processed. We show how to do
    this for binary text classification systems, emphasizing that
    different goals for the system lead to different optimal
    behaviors. Optimizing and estimating effectiveness is greatly
    aided if classifiers that explicitly estimate the probability of
    class membership are used.},
}
@article{Lewis95a,
  author    = {Lewis, David D.},
  title     = {A sequential algorithm for training text classifiers: corrigendum
    and additional data},
  journal   = {{SIGIR} Forum},
  year      = {1995},
  volume    = {29},
  number    = {2},
  pages     = {13--19},
  url       = {http://www.research.att.com/~lewis/papers/lewis95g.ps},
  abstract  = {Previously I compared the effectiveness of uncertainty sampling
    with that of random sampling and relevance sampling in choosing
    training data for a text categorization data set (Lewis and Gale,
    1994). (Relevance sampling is the application of relevance
    feedback to producing a training sample.) I have discovered a bug
    in my experimental software which caused the relevance sampling
    results reported in the paper to be incorrect. (The uncertainty
    sampling and random sampling results in that paper were correct.)
    I have since fixed the bug and rerun the experiments. This note
    presents the corrected results, along with additional data
    supporting the original claim that uncertainty sampling has an
    advantage over relevance sampling in most training situations.},
}
@inproceedings{Lewis95b,
  author    = {Lewis, David D.},
  title     = {The {TREC-4} filtering track: description and analysis},
  booktitle = {Proceedings of TREC-4, 4th Text Retrieval Conference},
  editor    = {Donna K. Harman and Ellen M. Voorhees},
  publisher = {National Institute of Standards and Technology, Gaithersburg, {US}},
  address   = {Gaithersburg, {US}},
  year      = {1995},
  pages     = {165--180},
  url       = {http://www.research.att.com/~lewis/papers/lewis96b.ps},
  abstract  = {The TREC-4 (4th Text REtrieval Conference) filtering track was an
    experiment in the evaluation of binary text classification
    systems. In contrast to ranking systems, binary text
    classification systems may need to produce result sets of any
    size, requiring that sampling be used to estimate their
    effectiveness. We present an effectiveness measure based on
    utility, and two sampling strategies (pooling and stratified
    sampling) for estimating the utility of the submitted sets. An
    evaluation of four sites was successfully carried out using this
    approach.},
}
@inproceedings{Lewis96,
  author    = {Lewis, David D. and Robert E. Schapire and James P. Callan and
    Ron Papka},
  title     = {Training algorithms for linear text classifiers},
  booktitle = {Proceedings of SIGIR-96, 19th ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {Hans-Peter Frei and Donna Harman and Peter Sch{\"{a}}uble and
    Ross Wilkinson},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Z{\"{u}}rich, {CH}},
  year      = {1996},
  pages     = {298--306},
  url       = {http://www.research.att.com/~lewis/papers/lewis96d.ps},
  abstract  = {Systems for text retrieval, routing, categorization and other IR
    tasks rely heavily on linear classifiers. We propose that two
    machine learning algorithms, the Widrow-Hoff and EG algorithms,
    be used in training linear text classifiers. In contrast to most
    IR methods, theoretical analysis provides performance guarantees
    and guidance on parameter settings for these algorithms.
    Experimental data is presented showing Widrow-Hoff and EG to be
    more effective than the widely used Rocchio algorithm on several
    categorization and routing tasks.},
}
@misc{Lewis97a,
  author    = {Lewis, David D.},
  title     = {Reuters-21578 text categorization test collection. {D}istribution
    1.0},
  year      = {1997},
  note      = {Available as {\tt
    http://www.research.att.com/\~{}lewis/reuters21578/README.txt}},
  url       = {http://www.research.att.com/~lewis/reuters21578/README.txt},
  abstract  = {[no abstract]},
}
@inproceedings{Lewis98,
  author    = {Lewis, David D.},
  title     = {Naive ({B}ayes) at forty: The independence assumption in
    information retrieval},
  booktitle = {Proceedings of ECML-98, 10th European Conference on Machine
    Learning},
  editor    = {Claire N{\'{e}}dellec and C{\'{e}}line Rouveirol},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Chemnitz, {DE}},
  year      = {1998},
  pages     = {4--15},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
    number 1398},
  url       = {http://www.research.att.com/~lewis/papers/lewis98b.ps},
  abstract  = {The naive Bayes classifier, currently experiencing a renaissance
    in machine learning, has long been a core technique in
    information retrieval. We review some of the variations of naive
    Bayes models used for text retrieval and classification, focusing
    on the distributional assumptions made about word occurrences in
    documents.},
}
@inproceedings{Lewis99,
  author    = {Lewis, David D. and Daniel L. Stern and Amit Singhal},
  title     = {{\sc Attics}: a software platform for on-line text classification},
  booktitle = {Proceedings of SIGIR-99, 22nd ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {Marti A. Hearst and Fredric Gey and Richard Tong},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Berkeley, {US}},
  year      = {1999},
  pages     = {267--268},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/312624/p267-lewis/p267-lewis.pdf},
  abstract  = {Numerous systems for ranked retrieval on text databases have been
    implemented by both information retrieval researchers and in the
    commercial sector. In contrast, software for text categorization,
    message filtering, textual data mining, and related tasks is less
    common. ATTICS is an extensible text classification system we
    have implemented in C++. It supports incremental training and
    online application of classifiers and predictive models to
    streams of textual, numeric, symbolic, and hybrid data records.
    An object-oriented design allows easy addition of new
    preprocessors, machine learning algorithms, and classifier types.},
}
@inproceedings{Lewis00,
  author    = {Lewis, David D.},
  title     = {Machine learning for text categorization: background and
    characteristics},
  booktitle = {Proceedings of the 21st Annual National Online Meeting},
  editor    = {Williams, Martha E.},
  publisher = {Information Today, Medford, {US}},
  address   = {New York, {US}},
  year      = {2000},
  pages     = {221--226},
  url       = {},
  abstract  = {Text categorization is of increasing interest in both controlled
    vocabulary indexing and other applications. Machine learning
    methods for automatically producing categorization rules have
    similarly seen increased attention, as a way to reduce the cost
    of fielding categorization systems. While the experimental
    literature on text categorization emphasizes effectiveness
    comparisons, we list a variety of other characteristics of
    learning approaches that are equally important to consider.
    Research on machine learning for text categorization, already
    advancing at a rapid pace, could be further accelerated if better
    test collections were available.},
}
@article{Lewis03,
  author    = {Lewis, David D. and Fan Li and Tony Rose and Yiming Yang},
  title     = {{Reuters Corpus Volume I} as a Text categorization test
    collection},
  journal   = {Journal of Machine Learning Research},
  year      = {2003},
  volume    = {},
  month     = {},
  pages     = {},
  note      = {Forthcoming},
  url       = {},
  abstract  = {},
}
@inproceedings{Li97,
  author    = {Hang Li and Kenji Yamanishi},
  title     = {Document classification using a finite mixture model},
  booktitle = {Proceedings of ACL-97, 35th Annual Meeting of the Association for
    Computational Linguistics},
  editor    = {Philip R. Cohen and Wolfgang Wahlster},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Madrid, {ES}},
  year      = {1997},
  pages     = {39--47},
  url       = {http://xxx.lanl.gov/ps/cmp-lg/9705005},
  abstract  = {We propose a new method of classifying documents into categories.
    The simple method of conducting hypothesis testing over
    word-based distributions in categories suffers from the data
    sparseness problem. In order to address this difficulty, Guthrie
    et.al. have developed a method using distributions based on hard
    clustering of words, i.e., in which a word is assigned to a
    single cluster and words in the same cluster are treated
    uniformly. This method might, however, degrade classification
    results, since the distributions it employs are not always
    precise enough for representing the differences between
    categories. We propose here the use of soft clustering of words,
    i.e., in which a word can be assigned to several different
    clusters and each cluster is characterized by a specific word
    probability distribution. We define for each document category a
    finite mixture model, which is a linear combination of the
    probability distributions of the clusters. We thereby treat the
    problem of classifying documents as that of conducting
    statistical hypothesis testing over finite mixture models. In
    order to accomplish this testing, we employ the EM algorithm
    which helps efficiently estimate parameters in a finite mixture
    model. Experimental results indicate that our method outperforms
    not only the method using distributions based on hard clustering,
    but also the method using word-based distributions and the method
    based on cosine-similarity.},
}
@inproceedings{Li99,
  author    = {Hang Li and Kenji Yamanishi},
  title     = {Text classification using {ESC}-based stochastic decision lists},
  booktitle = {Proceedings of CIKM-99, 8th ACM International Conference on
    Information and Knowledge Management},
  editor    = {},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Kansas City, {US}},
  year      = {1999},
  pages     = {122--130},
  url       = {http://www.acm.org/pubs/articles/proceedings/cikm/319950/p122-li/p122-li.pdf},
  abstract  = {We propose a new method of text classification using stochastic
    decision lists. A stochastic decision list is an ordered sequence
    of IF-THEN rules, and our method can be viewed as a rule-based
    method for text classification having advantages of readability
    and refinability of acquired knowledge. Our method is unique in
    that decision lists are automatically constructed on the basis of
    the principle of minimizing Extended Stochastic Complexity (ESC),
    and with it we are able to construct decision lists that have
    fewer errors in classification. The accuracy of classification
    achieved with our method appears better than or comparable to
    those of existing rule-based methods.},
}
@article{Li02,
  author    = {Hang Li and Kenji Yamanishi},
  title     = {Text classification using {ESC}-based stochastic decision lists},
  journal   = {Information Processing and Management},
  year      = {2002},
  volume    = {38},
  number    = {3},
  pages     = {343--361},
  url       = {},
  abstract  = {We propose a new method of text classification using stochastic
    decision lists. A stochastic decision list is an ordered sequence
    of IF-THEN-ELSE rules, and our method can be viewed as a
    rule-based method for text classification having advantages of
    readability and refinability of acquired knowledge. Our method is
    unique in that decision lists are automatically constructed on
    the basis of the principle of minimizing extended stochastic
    complexity (ESC), and with it we are able to construct decision
    lists that have fewer errors in classification. The accuracy of
    classification achieved with our method appears better than or
    comparable to those of existing rule-based methods. We have
    empirically demonstrated that rule-based methods like ours result
    in high classification accuracy when the categories to which
    texts are to be assigned are relatively specific ones and when
    the texts tend to be short. We have also empirically verified the
    advantages of rule-based methods over non-rule-based ones.},
}
@inproceedings{Li02a,
  author    = {Xin Li and Dan Roth},
  title     = {Learning question classifiers},
  booktitle = {Proceedings of COLING-02, 19th International Conference on
    Computational Linguistics},
  editor    = {},
  publisher = {},
  address   = {Taipei, {TW}},
  year      = {2002},
  url       = {http://l2r.cs.uiuc.edu/~danr/Papers/qc-coling02.pdf},
  abstract  = {In order to respond correctly to a free form factual question
    given a large collection of texts, one needs to understand the
    question to a level that allows determining some of the
    constraints the question imposes on a possible answer. These
    constraints may include a semantic classification of the sought
    after answer and may even suggest using different strategies when
    looking for and verifying a candidate answer. This paper presents
    a machine learning approach to question classification. We learn
    a hierarchical classifier that is guided by a layered semantic
    hierarchy of answer types, and eventually classifies questions
    into fine-grained classes. We show accurate results on a large
    collection of free-form questions used in TREC 10.},
}
@inproceedings{Li91,
  author    = {Wei Li and B. Lee and F. Krausz and K. Sahin},
  title     = {Text classification by a neural network},
  booktitle = {Proceedings of the 23rd Annual Summer Computer Simulation
    Conference},
  editor    = {},
  publisher = {},
  address   = {Baltimore, {US}},
  year      = {1991},
  pages     = {313--318},
  url       = {},
  abstract  = {When banks process their free-form telex traffic, the first task
    is the classification of the telexes. Historically, several
    attempts have been made to automate this process, using various
    stock phrases as the features on which to base the
    classification. This is a problem in which there are large
    amounts of data available, but the rules for classification are
    not explicitly available. For solving these kinds of problems,
    neural networks have the advantage of extracting the underlying
    relationships between the input data and the output classes
    automatically. Based on this consideration, the authors have
    built a neural network classification system, which has three
    subsystems: a user-maintainable feature definition subsystem, a
    feature extraction subsystem, and a neural network subsystem. The
    neural network is simulated on a VAX computer with a fast
    learning algorithm, and is combined with some non-statistical
    knowledge from the feature definition system. Above 90\% correct
    recognition rates have been achieved for the major categories
    concerned. The system is also applicable to text classification
    problems other than telex classification.},
}
@article{Li98a,
  author    = {Li, Yong H. and Jain, Anil K.},
  title     = {Classification of text documents},
  journal   = {The Computer Journal},
  year      = {1998},
  volume    = {41},
  number    = {8},
  pages     = {537--546},
  url       = {},
  abstract  = {The exponential growth of the Internet has led to a great deal of
    interest in developing useful and efficient tools and software to
    assist users in searching the Web. Document retrieval,
    categorization, routing and filtering can all be formulated as
    classification problems. However, the complexity of natural
    languages and the extremely high dimensionality of the feature
    space of documents have made this classification problem very
    difficult. We investigate four different methods for document
    classification: the naive Bayes classifier, the nearest neighbour
    classifier, decision trees and a subspace method. These were
    applied to seven-class Yahoo news groups (business,
    entertainment, health, international, politics, sports and
    technology) individually and in combination. We studied three
    classifier combination approaches: simple voting, dynamic
    classifier selection and adaptive classifier combination. Our
    experimental results indicate that the naive Bayes classifier and
    the subspace method outperform the other two classifiers on our
    data sets. Combinations of multiple classifiers did not always
    improve the classification accuracy compared to the best
    individual classifier. Among the three different combination
    approaches, our adaptive classifier combination method introduced
    here performed the best.},
}
@inproceedings{Li03,
author    = {Cong Li and Ji-Rong Wen and Hang Li},
title     = {Text Classification Using Stochastic Keyword Generation},
booktitle = {Proceedings of ICML-03, 20th International Conference on Machine
Learning},
editor    = {},
publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
address   = {Washington, {DC}},
year      = {2003},
pages     = {},
url       = {},
abstract  = {},
}
@inproceedings{Li03a,
author    = {Fan Li and Yiming Yang},
title     = {A Loss Function Analysis for Classification Methods in Text
Categorization},
booktitle = {Proceedings of ICML-03, 20th International Conference on Machine
Learning},
editor    = {},
publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
address   = {Washington, {DC}},
year      = {2003},
pages     = {},
url       = {},
abstract  = {},
}
@inproceedings{Liao02,
author    = {Yihua Liao and V. Rao Vemuri},
title     = {Using Text Categorization Techniques for Intrusion Detection},
booktitle = {Proceedings of the 11th USENIX Security Symposium},
editor    = {Dan Boneh},
publisher = {},
address   = {San Francisco, {US}},
year      = {2002},
pages     = {51--59},
url       = {http://www.usenix.org/publications/library/proceedings/sec02/liao.html},
abstract  = {A new approach, based on the k-Nearest Neighbor (kNN) classifier,
is used to classify program behavior as normal or intrusive.
Short sequences of system calls have been used by others to
characterize a program's normal behavior before. However,
separate databases of short system call sequences have to be
built for different programs, and learning program profiles
involves time-consuming training and testing processes. With the
kNN classifier, the frequencies of system calls are used to
describe the program behavior. Text categorization techniques are
adopted to convert each process to a vector and calculate the
similarity between two program activities. Since there is no need
to learn individual program profiles separately, the calculation
involved is largely reduced. Preliminary experiments with 1998
DARPA BSM audit data show that the kNN classifier can effectively
detect intrusive attacks and achieve a low false positive rate.},
}
@article{Liddy94,
author    = {Elizabeth D. Liddy and Woojin Paik and Edmund S. Yu},
title     = {Text categorization for multiple users based on semantic features
from a machine-readable dictionary},
journal   = {{ACM} Transactions on Information Systems},
year      = {1994},
volume    = {12},
number    = {3},
pages     = {278--295},
url       = {http://www.acm.org/pubs/articles/journals/tois/1994-12-3/p278-liddy/p278-liddy.pdf},
abstract  = {The text categorization module described in the paper provides a
front-end filtering function for the larger DR-LINK text
retrieval system (Liddy and Myaeng 1993). The module evaluates a
large incoming stream of documents to determine which documents
are sufficiently similar to a profile at the broad subject level
to warrant more refined representation and matching. To
accomplish this task, each substantive word in a text is first
categorized using a feature set based on the semantic subject
field codes (SFCs) assigned to individual word senses in a
machine-readable dictionary. When tested on 50 user profiles and
550 megabytes of documents, results indicate that the feature set
that is the basis of the text categorization module and the
algorithm that establishes the boundary of categories of
potentially relevant documents accomplish their tasks with a high
level of performance. This means that the category of potentially
relevant documents for most profiles would contain at least 80\%
of all documents later determined to be relevant to the profile.
The number of documents in this set would be uniquely determined
by the system's category-boundary predictor, and this set is
likely to contain less than 5\% of the incoming stream of
documents.},
}
@inproceedings{Liere97,
author    = {Ray Liere and Prasad Tadepalli},
title     = {Active learning with committees for text categorization},
booktitle = {Proceedings of AAAI-97, 14th Conference of the American
Association for Artificial Intelligence},
editor    = {},
publisher = {{AAAI} Press, Menlo Park, {US}},
address   = {Providence, {US}},
year      = {1997},
pages     = {591--596},
url       = {http://www.rdrop.com/~lierer/aaai97.ps},
abstract  = {In many real-world domains, supervised learning requires a large
number of training examples. In this paper, we describe an active
learning method that uses a committee of learners to reduce the
number of training examples required for learning. Our approach
is similar to the Query by Committee framework, where
disagreement among the committee members on the predicted label
for the input part of the example is used to signal the need for
knowing the actual value of the label. Our experiments are
conducted in the text categorization domain, which is
characterized by a large number of features, many of which are
irrelevant. We report here on experiments using a committee of
Winnow-based learners and demonstrate that this approach can
reduce the number of labeled training examples required over that
used by a single Winnow learner by 1-2 orders of magnitude.},
}
@inproceedings{Liere98,
author    = {Ray Liere and Prasad Tadepalli},
title     = {Active Learning with Committees: Preliminary Results in Comparing
{W}innow and {P}erceptron in Text Categorization},
booktitle = {Proceedings of CONALD-98, 1st Conference on Automated Learning
and Discovery},
editor    = {},
publisher = {{AAAI} Press, Menlo Park, {US}},
address   = {Pittsburgh, {US}},
year      = {1998},
pages     = {},
url       = {http://www.rdrop.com/~lierer/conald98.ps},
abstract  = {The availability of vast amounts of information on the World Wide
Web has created a big demand for automatic tools to organize and
index that information. Unfortunately, the paradigm of supervised
machine learning is ill-suited to this task, as it assumes that
the training examples are classified by a teacher, usually a
human. In this paper, we describe an active learning method based
on Query by Committee (QBC) that reduces the number of labeled
training examples (text documents) required for learning by 1-2
orders of magnitude.},
}
@inproceedings{Lim99,
author    = {Lim, Joo Hwee},
title     = {Learnable visual keywords for image classification},
booktitle = {Proceedings of DL-99, 4th ACM Conference on Digital Libraries},
editor    = {Edward A. Fox and Neil Rowe},
publisher = {{ACM} Press, New York, {US}},
address   = {Berkeley, {US}},
year      = {1999},
pages     = {139--145},
url       = {http://www.acm.org/pubs/articles/proceedings/dl/313238/p139-lim/p139-lim.pdf},
abstract  = {Automatic categorization of multimedia documents is an important
function for a digital library system. While text categorization
has received much attentions by IR researchers, classification of
visual data is at its infancy stage. In this paper, we propose a
notion of visual keywords for similarity matching between visual
contents. Visual keywords can be constructed automatically from
samples of visual data through supervised/unsupervised learning.
Given a visual content, the occurrences of visual keywords are
detected, summarized spatially, and coded via singular value
decomposition to arrive at a concise coded description. The
methods to create, detect, summarize, select, and code visual
keywords will be detailed. Last but not least, we describe an
evaluation experiment that classifies professional nature scenery
photographs to demonstrate the effectiveness and efficiency of
visual keywords for automatic categorization of images in digital
libraries.},
}
@inproceedings{Liu02,
author    = {Yan Liu and Yiming Yang and Jaime Carbonell},
title     = {Boosting to Correct the Inductive Bias for Text Classification},
booktitle = {Proceedings of CIKM-02, 11th ACM International Conference on
Information and Knowledge Management},
editor    = {},
publisher = {{ACM} Press, New York, {US}},
address   = {McLean, {US}},
year      = {2002},
pages     = {348--355},
url       = {http://doi.acm.org/10.1145/584792.584850},
abstract  = {This paper studies the effects of boosting in the context of
different classification methods for text categorization,
including Decision Trees, Naive Bayes, Support Vector Machines
(SVMs) and a Rocchio-style classifier. We identify the inductive
biases of each classifier and explore how boosting, as an
error-driven resampling mechanism, reacts to those biases. Our
experiments on the Reuters-21578 benchmark show that boosting is
not effective in improving the performance of the base
classifiers on common categories. However, the effect of boosting
for rare categories varies across classifiers: for SVMs and
Decision Trees, we achieved a 13-17\% performance improvement in
macro-averaged F1 measure, but did not obtain substantial
improvement for the other two classifiers. This interesting
finding of boosting on rare categories has not been reported
before.},
}
@article{Lodhi02,
author    = {Huma Lodhi and Craig Saunders and John Shawe-Taylor and Nello
Cristianini and Chris Watkins},
title     = {Text Classification using String Kernels},
journal   = {Journal of Machine Learning Research},
year      = {2002},
volume    = {2},
pages     = {419--444},
url       = {http://www.ai.mit.edu/projects/jmlr/papers/volume2/lodhi02a/lodhi02a.pdf},
abstract  = {We propose a novel approach for categorizing text documents based
on the use of a special kernel. The kernel is an inner product in
the feature space generated by all subsequences of length k. A
subsequence is any ordered sequence of k characters occurring in
the text though not necessarily contiguously. The subsequences
are weighted by an exponentially decaying factor of their full
length in the text, hence emphasising those occurrences that are
close to contiguous. A direct computation of this feature vector
would involve a prohibitive amount of computation even for modest
values of k, since the dimension of the feature space grows
exponentially with k. The paper describes how despite this fact
the inner product can be efficiently evaluated by a dynamic
programming technique. Experimental comparisons of the
performance of the kernel compared with a standard word feature
space kernel (Joachims, 1998) show positive results on modestly
sized datasets. The case of contiguous subsequences is also
considered for comparison with the subsequences kernel with
different decay factors. For larger documents and datasets the
paper introduces an approximation technique that is shown to
deliver good approximations efficiently for large datasets.},
}
@inproceedings{Macskassy01,
author    = {Sofus A. Macskassy and Haym Hirsh and Arunava Banerjee and Aynur
A. Dayanik},
title     = {Using Text Classifiers for Numerical Classification},
booktitle = {Proceedings of IJCAI-01, 17th International Joint Conference on
Artificial Intelligence},
editor    = {Bernhard Nebel},
address   = {Seattle, {US}},
year      = {2001},
pages     = {885--890},
url       = {http://www.cs.rutgers.edu/~sofmac/paper/ijcai2001/macskassy-ijcai2001.pdf},
abstract  = {Consider a supervised learning problem in which examples contain
both numerical- and text-valued features. To use traditional
feature-vector-based learning methods, one could treat the
presence or absence of a word as a Boolean feature and use these
binary-valued features together with the numerical features.
However, the use of a text-classification system on this is a bit
more problematic --- in the most straight-forward approach each
number would be considered a distinct token and treated as a
word. This paper presents an alternative approach for the use of
text classification methods for supervised learning problems
with numerical-valued features in which the numerical features
are converted into bag-of-words features, thereby making them
directly usable by text classification methods. We show that even
on purely numerical-valued data the results of
text-classification on the derived text-like representation
outperforms the more naive numbers-as-tokens representation and,
more importantly, is competitive with mature numerical
classification methods such as C4.5 and Ripper.},
}
@article{Maderlechner97,
author    = {Maderlechner, G. and Suda, P. and Bruckner, T.},
title     = {Classification of documents by form and content},
journal   = {Pattern Recognition Letters},
year      = {1997},
volume    = {18},
number    = {11/13},
pages     = {1225--1231},
url       = {},
abstract  = {This paper presents a modular software system, which classifies a
large variety of office documents according to layout form and
textual content. It consists of the following components: layout
analysis, pre-classification, OCR interface, fuzzy string
matching, text categorization, lexical, syntactical and semantic
analysis. The system has been applied to the following tasks:
presorting of forms, reports and letters, index extraction for
archiving and retrieval, page type classification and text column
analysis of real estate register documents, in-house mail sorting
and electronic distribution to departments. The architecture,
modules, and practical results are described.},
}
@article{Manevitz01,
author    = {Larry M. Manevitz and Malik Yousef},
title     = {One-Class {SVMs} for Document Classification},
journal   = {Journal of Machine Learning Research},
year      = {2001},
month     = dec,
volume    = {2},
pages     = {139--154},
url       = {http://www.ai.mit.edu/projects/jmlr/papers/volume2/manevitz01a/manevitz01a.pdf},
abstract  = {We implemented versions of the SVM appropriate for one-class
classification in the context of information retrieval. The
experiments were conducted on the standard Reuters data set. For
the SVM implementation we used both a version of Schoelkopf et
al. and a somewhat different version of one-class SVM based on
identifying ``outlier'' data as representative of the
second-class. We report on experiments with different kernels for
both of these implementations and with different representations
of the data, including binary vectors, tf-idf representation and
a modification called ``Hadamard'' representation. Then we
compared it with one-class versions of the algorithms prototype
(Rocchio), nearest neighbor, naive Bayes, and finally a natural
one-class neural network classification method based on
``bottleneck'' compression generated filters. The SVM approach as
represented by Schoelkopf was superior to all the methods except
the neural network one, where it was, although occasionally
worse, essentially comparable. However, the SVM methods turned
out to be quite sensitive to the choice of representation and
kernel in ways which are not well understood; therefore, for the
time being leaving the neural network approach as the most
robust.},
}
@inbook{Manning99a,
author    = {Christopher Manning and Hinrich Sch{\"{u}}tze},
title     = {Foundations of Statistical Natural Language Processing},
publisher = {The {MIT} Press},
address   = {Cambridge, {US}},
year      = {1999},
chapter   = {16: Text Categorization},
pages     = {575--608},
url       = {},
abstract  = {},
}
@article{Maron61,
author    = {M. E. Maron},
title     = {Automatic indexing: an experimental inquiry},
journal   = {Journal of the Association for Computing Machinery},
year      = {1961},
volume    = {8},
number    = {3},
pages     = {404--417},
url       = {http://www.acm.org/pubs/articles/journals/jacm/1961-8-3/p404-maron/p404-maron.pdf},
abstract  = {This inquiry examines a technique for automatically classifying
(indexing) documents according to their subject content. The
task, in essence, is to have a computing machine read a document
and on the basis of the occurrence of selected clue words decide
to which of many subject categories the document in question
belongs. This paper describes the design, execution and
evaluation of a modest experimental study aimed at testing
empirically one statistical technique for automatic indexing.},
}
@inproceedings{Masand92,
author    = {Brij Masand and Gordon Linoff and David Waltz},
title     = {Classifying news stories using memory-based reasoning},
booktitle = {Proceedings of SIGIR-92, 15th ACM International Conference on
Research and Development in Information Retrieval},
editor    = {Nicholas J. Belkin and Peter Ingwersen and Annelise Mark
Pejtersen},
publisher = {{ACM} Press, New York, {US}},
address   = {K{\o}benhavn, {DK}},
year      = {1992},
pages     = {59--65},
url       = {http://www.acm.org/pubs/articles/proceedings/ir/133160/p59-masand/p59-masand.pdf},
abstract  = {We describe a method for classifying news stories using Memory
Based Reasoning (MBR) (a k-nearest neighbor method), that does not
require manual topic definitions. Using an already coded training
database of about 50,000 stories from the Dow Jones Press Release
News Wire, and SEEKER [Stanfill] (a text retrieval system that
supports relevance feedback) as the underlying match engine,
codes are assigned to new, unseen stories with a recall of about
80\% and precision of about 70\%. There are about 350 different
codes to be assigned. Using a massively parallel supercomputer,
we leverage the information already contained in the thousands of
coded stories and are able to code a story in about 2 seconds.
Given SEEKER, the text retrieval system, we achieved these
results in about two person-months. We believe this approach is
effective in reducing the development time to implement
classification systems involving large number of topics for the
purpose of classification, message routing etc.},
}
@incollection{Masand94,
author    = {Brij Masand},
title     = {Optimising confidence of text classification by evolution of
symbolic expressions},
booktitle = {Advances in genetic programming},
editor    = {Kenneth E. Kinnear},
publisher = {The {MIT} Press},
address   = {Cambridge, {US}},
year      = {1994},
chapter   = {21},
pages     = {459--476},
url       = {},
abstract  = {},
}
@inproceedings{Matsuda98,
author    = {Katsushi Matsuda and Toshikazu Fukushima},
title     = {Task-oriented {W}orld {W}ide {W}eb retrieval by document type
classification},
booktitle = {Proceedings of CIKM-98, 7th ACM International Conference on
Information and Knowledge Management},
editor    = {Georges Gardarin and James C. French and Niki Pissinou and Kia
Makki and Luc Bouganim},
publisher = {{ACM} Press, New York, {US}},
address   = {Bethesda, {US}},
year      = {1998},
pages     = {109--113},
url       = {http://www.acm.org/pubs/articles/proceedings/cikm/319950/p109-matsuda/p109-matsuda.pdf},
abstract  = {This paper proposes a novel approach to accurately searching Web
pages for relevant information in problem solving by specifying a
Web document category instead of the user's task. Accessing
information from World Wide Web pages as an approach to problem
solving has become commonplace. However, such a search is
difficult with current search services, since these services only
provide keyword-based search methods that are equivalent to
narrowing down the target references according to domains.
However, problem solving usually involves both a domain and a
task. Accordingly, our approach is based on problem solving
tasks. To specify a user's problem solving task, we introduce the
concept of document types that directly relate to the problem
solving tasks; with this approach, users can easily designate
problem solving tasks. We implemented PageTypeSearch system based
on our approach. Classifier of PageTypeSearch classifies Web
pages into the document types by comparing their pages with
typical structural characteristics of the types. We compare
PageTypeSearch using the document type-indices with a
conventional keyword-based search system in experiments. The
average precision of the document type-based search is 88.9\%,
while the average precision of the keyword-based search is
31.2\%. Moreover, the number of irrelevant references gathered by
our system is about one-thirteenth that of traditional
keyword-based search systems. Our approach has practical
advantages for problem solving by introducing the viewpoint of
tasks to achieve higher performance.},
}
@inproceedings{McCallum98,
author    = {Andrew K. McCallum and Kamal Nigam},
title     = {Employing {EM} in pool-based active learning for text
classification},
booktitle = {Proceedings of ICML-98, 15th International Conference on Machine
Learning},
editor    = {Jude W. Shavlik},
publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
address   = {Madison, {US}},
year      = {1998},
pages     = {350--358},
url       = {http://www.cs.cmu.edu/~mccallum/papers/emactive-icml98.ps.gz},
abstract  = {The paper shows how a text classifier's need for labeled training
documents can be reduced by taking advantage of a large pool of
unlabeled documents. We modify the Query-by-Committee (QBC)
method of active learning to use the unlabeled pool for
explicitly estimating document density when selecting examples
for labeling. Then active learning is combined with
Expectation-Maximization in order to ``fill in'' the class labels
of those documents that remain unlabeled. Experimental results
show that the improvements to active learning require less than
two-thirds as many labeled training examples as previous QBC
approaches, and that the combination of EM and active learning
requires only slightly more than half as many labeled training
examples to achieve the same accuracy as either the improved
active learning or EM alone.},
}
@inproceedings{McCallum98b,
author    = {Andrew K. McCallum and Ronald Rosenfeld and Tom M. Mitchell and
Andrew Y. Ng},
title     = {Improving text classification by shrinkage in a hierarchy of
classes},
booktitle = {Proceedings of ICML-98, 15th International Conference on Machine
Learning},
editor    = {Jude W. Shavlik},
publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
address   = {Madison, {US}},
year      = {1998},
pages     = {359--367},
url       = {http://www.cs.cmu.edu/~mccallum/papers/hier-icml98.ps.gz},
abstract  = {When documents are organized in a large number of topic
categories, the categories are often arranged in a hierarchy. The
US patent database and Yahoo are two examples. The paper shows
that the accuracy of a naive Bayes text classifier can be
significantly improved by taking advantage of a hierarchy of
classes. We adopt an established statistical technique called
shrinkage that smooths parameter estimates of a data-sparse child
with its parent in order to obtain more robust parameter
estimates. The approach is also employed in deleted
interpolation, a technique for smoothing n-grams in language
modeling for speech recognition. Our method scales well to large
data sets, with numerous categories in large hierarchies.
Experimental results on three real world data sets from UseNet,
Yahoo, and corporate Web pages show improved performance, with a
reduction in error up to 29\% over the traditional flat
classifier.},
}
@inproceedings{Meretakis00,
author    = {Dimitris Meretakis and Dimitris Fragoudis and Hongjun Lu and
Spiros Likothanassis},
title     = {Scalable Association-based Text Classification},
booktitle = {Proceedings of CIKM-00, 9th ACM International Conference on
Information and Knowledge Management},
editor    = {Arvin Agah and Jamie Callan and Elke Rundensteiner},
publisher = {{ACM} Press, New York, {US}},
address   = {McLean, {US}},
year      = {2000},
pages     = {373--374},
url       = {http://www.cs.ust.hk/~meretaks/papers/mfll-cikm2000.pdf},
abstract  = {Naive Bayes (NB) classifier has long been considered a core
methodology in text classification mainly due to its simplicity
and computational efficiency. There is an increasing need however
for methods that can achieve higher classification accuracy while
maintaining the ability to process large document collections. In
this paper we examine text categorization methods from a
perspective that considers the tradeoff between accuracy and
scalability to large data sets and large feature sizes. We start
from the observation that Support Vector Machines, one of the
best text categorization methods cannot scale up to handle the
large document collections involved in many real word problems.
We then consider bayesian extensions to NB that achieve higher
accuracy by relaxing its strong independence assumptions. Our
experimental results show that LB, an association-based lazy
classifier can achieve a good tradeoff between high
classification accuracy and scalability to large document
collections and large feature sizes.},
}
@article{Merkl98,
author    = {Merkl, Dieter},
title     = {Text classification with self-organizing maps: Some lessons
learned},
journal   = {Neurocomputing},
year      = {1998},
volume    = {21},
number    = {1/3},
pages     = {61--77},
url       = {},
abstract  = {We discuss ways of using self-organizing maps for document
classification. Furthermore, we focus on the fact that document
collections lend themselves naturally to a hierarchical structure
defined by the subject matter of the documents. We take advantage
of this fact by using a hierarchically organized neural network,
built up from a number of independent self-organizing maps in
order to enable the true establishment of a document taxonomy.
Using such an architecture, the time needed for training is
reduced substantially and the user is provided with an even more
intuitive metaphor for visualization. Since the single layers of
self-organizing maps represent different aspects of the document
collection at different levels of detail, the neural network
shows the document collection in a form comparable to an atlas
where the user may easily select the most appropriate degree of
granularity depending on the actual focus of interest during the
exploration of the document collection.},
}
@inproceedings{Mladenic98a,
author    = {Dunja Mladeni{\'{c}}},
title     = {Turning {{\sc Yahoo!}}\ into an automatic {W}eb page classifier},
booktitle = {Proceedings of ECAI-98, 13th European Conference on Artificial
Intelligence},
editor    = {Henri Prade},
publisher = {John Wiley and Sons, Chichester, {UK}},
address   = {Brighton, {UK}},
year      = {1998},
pages     = {473--474},
url       = {http://www-ai.ijs.si/DunjaMladenic/papers/PWW/pwwECAI98yr.ps.gz},
abstract  = {The paper describes an approach to automatic Web-page
classification based on the Yahoo hierarchy. Machine learning
techniques developed for learning on text data are used here on
the hierarchical classification structure. The high number of
features is reduced by taking into account the hierarchical
structure and using feature subset selection based on the method
known from information retrieval. Documents are represented as
feature-vectors that include n-grams instead of including only
single words (unigrams) as commonly used when learning on text
data. Based on the hierarchical structure the problem is divided
into subproblems, each representing one of the categories
included in the Yahoo hierarchy. The result of learning is a set
of independent classifiers, each used to predict the probability
that a new example is a member of the corresponding category.
Experimental evaluation on real-world data shows that the
proposed approach gives good results. For more than a half of
testing examples a correct category is among the 3 categories
with the highest predicted probability.},
}
@inproceedings{Mladenic98b,
author    = {Dunja Mladeni{\'{c}}},
title     = {Feature subset selection in text learning},
booktitle = {Proceedings of ECML-98, 10th European Conference on Machine
Learning},
editor    = {Claire N{\'{e}}dellec and C{\'{e}}line Rouveirol},
publisher = {Springer Verlag, Heidelberg, {DE}},
address   = {Chemnitz, {DE}},
year      = {1998},
pages     = {95--100},
note      = {Published in the ``Lecture Notes in Computer Science'' series,
number 1398},
url       = {http://www-ai.ijs.si/DunjaMladenic/papers/PWW/pwwECML98.ps.gz},
abstract  = {This paper describes several known and some new methods for
feature subset selection on large text data. Experimental
comparison given on real-world data collected from Web users
shows that characteristics of the problem domain and machine
learning algorithm should be considered when feature scoring
measure is selected. Our problem domain consists of hyperlinks
given in a form of small-documents represented with word vectors.
In our learning experiments naive Bayesian classifier was used on
text data. The best performance was achieved by the feature
selection methods based on the feature scoring measure called
Odds ratio that is known from information retrieval.},
}
@phdthesis{Mladenic98c,
author    = {Dunja Mladeni{\'{c}}},
title     = {Machine Learning on non-homogeneous, distributed text data},
school    = {J.\ Stefan Institute, University of Ljubljana},
address   = {Ljubljana, {SI}},
year      = {1998},
url       = {http://www-ai.ijs.si/DunjaMladenic/papers/PhD/PhDFinal.ps},
abstract  = {},
}
@article{Mladenic99,
author    = {Dunja Mladeni{\'{c}}},
title     = {Text learning and related intelligent agents: a survey},
journal   = {{IEEE} Intelligent Systems},
year      = {1999},
volume    = {14},
number    = {4},
pages     = {44--54},
url       = {http://www-ai.ijs.si/DunjaMladenic/papers/PWW/agentOverIEEE.ps.gz},
abstract  = {Analysis of text data using intelligent information retrieval,
machine learning, natural language processing or other related
methods is becoming an important issue for the development of
intelligent agents. There are two frequently used approaches to
the development of intelligent agents using machine learning
techniques: a content-based and a collaborative approach. In the
first approach, the content (e.g., text) plays an important role,
while in the second approach, the existence of several knowledge
sources (e.g., several users) is required. We can say that the
usage of machine learning techniques on text databases (usually
referred to as text-learning) is an important part of the
content-based approach. Examples are agents for locating
information on World Wide Web and Usenet news filtering agents.
There are different research questions important for the
development of text-learning intelligent agents. We focus on
three of them: what representation is used for documents, how is
the high number of features dealt with and which learning
algorithm is used. These questions are addressed in an overview
of the existing approaches to text classification. For
illustration we give a brief description of the content-based
personal intelligent agent named Personal WebWatcher that uses
text-learning for user customized Web browsing.},
}
@inproceedings{Mladenic98d,
author    = {Dunja Mladeni{\'{c}} and Marko Grobelnik},
title     = {Word sequences as features in text-learning},
booktitle = {Proceedings of ERK-98, the Seventh Electrotechnical and Computer
Science Conference},
address   = {Ljubljana, {SI}},
year      = {1998},
pages     = {145--148},
}
@inproceedings{Mladenic99a,
author    = {Dunja Mladeni{\'{c}} and Marko Grobelnik},
title     = {Feature selection for unbalanced class distribution and Naive
{B}ayes},
booktitle = {Proceedings of ICML-99, 16th International Conference on Machine
Learning},
editor    = {Ivan Bratko and Sa{\v{s}}o D{\v{z}}eroski},
publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
address   = {Bled, {SI}},
year      = {1999},
pages     = {258--267},
url       = {http://www-ai.ijs.si/DunjaMladenic/papers/PWW/pwwICML99Final.ps.gz},
abstract  = {This paper describes an approach to feature subset selection that
takes into account problem specifics and learning algorithm
characteristics. It is developed for the Naive Bayesian
classifier applied on text data, since it combines well with the
addressed learning problems. We focus on domains with many
features that also have a highly unbalanced class distribution
and asymmetric misclassification costs given only implicitly in
the problem. By asymmetric misclassification costs we mean that
one of the class values is the target class value for which we
want to get predictions and we prefer false positive over false
negative. Our example problem is automatic document
categorization using machine learning, where we want to identify
documents relevant for the selected category. Usually, only about
1\%-10\% of examples belong to the selected category. Our
experimental comparison of eleven feature scoring measures show
that considering domain and algorithm characteristics
significantly improves the results of classification.},
}
@article{Mladenic03,
  author   = {Mladeni{\'{c}}, Dunja and Grobelnik, Marko},
  title    = {Feature selection on hierarchy of {W}eb documents},
  journal  = {Decision Support Systems},
  year     = {2003},
  volume   = {35},
  number   = {1},
  pages    = {45--87},
  abstract = {The paper describes feature subset selection used in learning on
    text data (text learning) and gives a brief overview of feature
    subset selection commonly used in machine learning. Several known
    and some new feature scoring measures appropriate for feature
    subset selection on large text data are described and related to
    each other. Experimental comparison of the described measures is
    given on real-world data collected from the Web. Machine learning
    techniques are used on data collected from Yahoo, a large text
    hierarchy of Web documents. Our approach includes some original
    ideas for handling large number of features, categories and
    documents. The high number of features is reduced by feature
    subset selection and additionally by using `stop-list', pruning
    low-frequency features and using a short description of each
    document given in the hierarchy instead of using the document
    itself. Documents are represented as feature-vectors that include
    word sequences instead of including only single words as commonly
    used when learning on text data. An efficient approach to
    generating word sequences is proposed. Based on the hierarchical
    structure, we propose a way of dividing the problem into
    subproblems, each representing one of the categories included in
    the Yahoo hierarchy. In our learning experiments, for each of the
    subproblems, naive Bayesian classifier was used on text data. The
    result of learning is a set of independent classifiers, each used
    to predict probability that a new example is a member of the
    corresponding category. Experimental evaluation on real-world
    data shows that the proposed approach gives good results. The
    best performance was achieved by the feature selection based on a
    feature scoring measure known from information retrieval called
    Odds ratio and using relatively small number of features.},
}
@article{Moens00,
  author   = {Moens, Marie-Francine and Dumortier, Jos},
  title    = {Text categorization: the assignment of subject descriptors to
    magazine articles},
  journal  = {Information Processing and Management},
  year     = {2000},
  volume   = {36},
  number   = {6},
  pages    = {841--861},
  abstract = {Automatic text categorization is an important research area and
    has a potential for many text-based applications including text
    routing and filtering. Typical text classifiers learn from
    example texts that are manually categorized. When categorizing
    magazine articles with broad subject descriptors, we study three
    aspects of text classification: (1) effective selection of
    feature words and proper names that reflect the main topics of
    the text; (2) learning algorithms; and (3) improvement of the
    quality of the learned classifier by selection of examples. The
    $\chi^{2}$ test, which is sometimes used for selecting terms that are
    highly related to a text class, is applied in a novel way when
    constructing a category weight vector. Despite a limited number
    of training examples, combining an effective feature selection
    with the $\chi^{2}$ learning algorithm for training the text
    classifier results in an adequate categorization of new magazine
    articles.},
}
@inProceedings{Mooney00,
  author    = {Mooney, Raymond J. and Roy, Loriene},
  title     = {Content-based book recommending using learning for text
    categorization},
  booktitle = {Proceedings of DL-00, 5th ACM Conference on Digital Libraries},
  publisher = {{ACM} Press, New York, {US}},
  address   = {San Antonio, {US}},
  year      = {2000},
  pages     = {195--204},
  url       = {ftp://ftp.cs.utexas.edu/pub/mooney/papers/libra-dl-00.ps.gz},
  abstract  = {Recommender systems improve access to relevant products and
    information by making personalized suggestions based on previous
    examples of a user's likes and dislikes. Most existing
    recommender systems use collaborative filtering methods that base
    recommendations on other users' preferences. By contrast,
    content-based methods use information about an item itself to
    make suggestions. This approach has the advantage of being able
    to recommend previously unrated items to users with unique
    interests and to provide explanations for its recommendations. We
    describe a content-based book recommending system that utilizes
    information extraction and a machine-learning algorithm for text
    categorization. Initial experimental results demonstrate that
    this approach can produce accurate recommendations.},
}
@inProceedings{Moschitti03,
  author    = {Moschitti, Alessandro},
  title     = {A study on optimal parameter tuning for {R}occhio text classifier},
  booktitle = {Proceedings of ECIR-03, 25th European Conference on Information
    Retrieval},
  editor    = {Sebastiani, Fabrizio},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Pisa, {IT}},
  year      = {2003},
  pages     = {420--435},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
    number 2633},
  url       = {http://link.springer.de/link/service/series/0558/papers/2633/26330420.pdf},
  abstract  = {Current trend in operational text categorization is the designing
    of fast classification tools. Several studies on improving
    accuracy of fast but less accurate classifiers have been recently
    carried out. In particular, enhanced versions of the Rocchio text
    classifier, characterized by high performance, have been
    proposed. However, even in these extended formulations the
    problem of tuning its parameters is still neglected. In this
    paper, a study on parameters of the Rocchio text classifier has
    been carried out to achieve its maximal accuracy. The result is a
    model for the automatic selection of parameters. Its main feature
    is to bind the searching space so that optimal parameters can be
    selected quickly. The space has been bound by giving a feature
    selection interpretation of the Rocchio parameters. The benefit
    of the approach has been assessed via extensive cross evaluation
    over three corpora in two languages. Comparative analysis shows
    that the performances achieved are relatively close to the best
    TC models (e.g. Support Vector Machines).},
}
@article{Mostafa00,
  author   = {Mostafa, Javed and Lam, Wai},
  title    = {Automatic classification using supervised learning in a medical
    document filtering application},
  journal  = {Information Processing and Management},
  year     = {2000},
  volume   = {36},
  number   = {3},
  pages    = {415--444},
  abstract = {Document classifiers can play an intermediate role in multilevel
    filtering systems. The effectiveness of a classifier that uses
    supervised learning was analyzed in terms of its accuracy and
    ultimately its influence on filtering. The analysis was conducted
    in two phases. In the first phase, a multilayer feed-forward
    neural network was trained to classify medical documents in the
    area of cell biology. The accuracy of the supervised classifier
    was established by comparing its performance with a baseline
    system that uses human classification information. A relatively
    high degree of accuracy was achieved by the supervised method,
    however, classification accuracy varied across classes. In the
    second phase, to clarify the impact of this performance on
    filtering, different types of user profiles were created by
    grouping subsets of classes based on their individual
    classification accuracy rates. Then, a filtering system with the
    neural network integrated into it was used to filter the medical
    documents and this performance was compared with the filtering
    results achieved using the baseline system. The performance of
    the system using the neural network classifier was generally
    satisfactory and, as expected, the filtering performance varied
    with regard to the accuracy rates of classes.},
}
@inProceedings{Moulinier96a,
  author    = {Moulinier, Isabelle and Ganascia, Jean-Gabriel},
  title     = {Applying an existing machine learning algorithm to text
    categorization},
  booktitle = {Connectionist, statistical, and symbolic approaches to learning
    for natural language processing},
  editor    = {Wermter, Stefan and Riloff, Ellen and Scheler, Gabriele},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  year      = {1996},
  pages     = {343--354},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
    number 1040},
  url       = {http://www-poleia.lip6.fr/~moulinie/wijcai.ps.gz},
  abstract  = {The information retrieval community is becoming increasingly
    interested in machine learning techniques, of which text
    categorization is an application. This paper describes how we
    have applied an existing similarity-based learning algorithm,
    CHARADE, to the text categorization problem and compares the
    results with those obtained using decision tree construction
    algorithms. From a machine learning point of view, this study was
    motivated by the size of the inspected data in such applications.
    Using the same representation of documents, CHARADE offers better
    performance than earlier reported experiments with decision trees
    on the same corpus. In addition, the way in which learning with
    redundancy influences categorization performance is also studied.},
}
@inProceedings{Moulinier96,
  author    = {Moulinier, Isabelle and Ra{\v{s}}kinis, Gailius and Ganascia,
    Jean-Gabriel},
  title     = {Text categorization: a symbolic approach},
  booktitle = {Proceedings of SDAIR-96, 5th Annual Symposium on Document
    Analysis and Information Retrieval},
  address   = {Las Vegas, {US}},
  year      = {1996},
  pages     = {87--99},
  url       = {http://www-poleia.lip6.fr/~moulinie/sdair.ps.gz},
  abstract  = {Recent research in machine learning has been concerned with
    scaling-up to large data sets. Since information retrieval is a
    domain where such data sets are widespread, it provides an ideal
    application area for machine learning. This paper studies the
    ability of symbolic learning algorithms to perform a text
    categorization task. This ability depends on both text
    representation and feature filtering. We present a unified view
    of text categorization systems, focusing on the selection of
    features. A new selection technique, SCAR, is proposed for k-DNF
    (disjunctive normal form) learners and evaluated on the Reuters
    financial data set. Even though our experimental results do not
    outperform earlier approaches, they give rise to promising
    perspectives.},
}
@inProceedings{Moulinier97,
  author    = {Moulinier, Isabelle},
  title     = {Feature selection: a useful preprocessing step},
  booktitle = {Proceedings of BCSIRSG-97, the 19th Annual Colloquium of the
    British Computer Society Information Retrieval Specialist Group},
  editor    = {Furner, Jonathan and Harper, David},
  series    = {Electronic Workshops in Computing},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Aberdeen, {UK}},
  year      = {1997},
  url       = {http://www.ewic.org.uk/ewic/workshop/fetch.cfm/IRR-97/Moulinier/Moulinier.ps},
  abstract  = {Statistical classification techniques and machine learning
    methods have been applied to some information retrieval (IR)
    problems: routing, filtering and categorization. Most of these
    methods are usually awkward and sometimes intractable in
    high-dimensional feature spaces. In order to reduce
    dimensionality, feature selection has been introduced as a
    preprocessing step. In this paper, we assess to what extent
    feature selection can be used without causing a loss in
    effectiveness. This problem can be tackled since a couple of
    recent learners (Ripper and Scar) do not require a preprocessing
    step. On a text categorization task, using the Reuters-22,173
    collection, we give empirical evidence that feature selection is
    useful: first, the size of the collection index can be
    drastically reduced without causing a significant loss in
    categorization effectiveness. Then, we show that feature
    selection speeds up the time required to automatically build the
    categorization system.},
}
% NOTE(review): Myers00 previously carried a url whose file name
% (ecoc-icml.ps, under another author's directory) does not match this
% paper; it was dropped until the correct link is confirmed.
@inProceedings{Myers00,
  author    = {Myers, Kary and Kearns, Michael and Singh, Satinder and Walker,
    Marilyn A.},
  title     = {A Boosting Approach to Topic Spotting on Subdialogues},
  booktitle = {Proceedings of ICML-00, 17th International Conference on Machine
    Learning},
  editor    = {Langley, Pat},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Stanford, {US}},
  year      = {2000},
  pages     = {655--662},
  abstract  = {We report the results of a study on topic spotting in
    conversational speech. Using a machine learning approach, we
    build classifiers that accept an audio file of conversational
    human speech as input, and output an estimate of the topic being
    discussed. Our methodology makes use of a well-known corpus of
    transcribed and topic-labeled speech (the Switchboard corpus),
    and involves an interesting double use of the BOOSTEXTER learning
    algorithm. Our work is distinguished from previous efforts in
    topic spotting by our explicit study of the effects of dialogue
    length on classifier performance, and by our use of off-the-shelf
    speech recognition technology. One of our main results is the
    identification of a single classifier with good performance
    (relative to our classifier space) across all subdialogue
    lengths.},
}
@inProceedings{Nardiello03,
  author    = {Nardiello, Pio and Sebastiani, Fabrizio and Sperduti, Alessandro},
  title     = {Discretizing continuous attributes in {A}da{B}oost for text
    categorization},
  booktitle = {Proceedings of ECIR-03, 25th European Conference on Information
    Retrieval},
  editor    = {Sebastiani, Fabrizio},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Pisa, {IT}},
  year      = {2003},
  pages     = {320--334},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
    number 2633},
  url       = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/ECIR03.pdf},
  abstract  = {We focus on two recently proposed algorithms in the family of
    ``boosting''-based learners for automated text classification,
    \textsc{AdaBoost.MH} and \textsc{AdaBoost.MH$^{KR}$}. While the
    former is a realization of the well-known \textsc{AdaBoost}
    algorithm specifically aimed at multi-label text categorization,
    the latter is a generalization of the former based on the idea of
    learning a committee of classifier sub-committees. Both
    algorithms have been among the best performers in text
    categorization experiments so far. A problem in the use of both
    algorithms is that they require documents to be represented by
    binary vectors, indicating presence or absence of the terms in
    the document. As a consequence, these algorithms cannot take full
    advantage of the ``weighted'' representations (consisting of
    vectors of continuous attributes) that are customary in
    information retrieval tasks, and that provide a much more
    significant rendition of the document's content than binary
    representations. In this paper we address the problem of
    exploiting the potential of weighted representations in the
    context of \textsc{AdaBoost}-like algorithms by discretizing the
    continuous attributes through the application of entropy-based
    discretization methods. We present experimental results on the
    \textsf{Reuters-21578} text categorization collection, showing
    that for both algorithms the version with discretized continuous
    attributes outperforms the version with traditional binary
    representations.},
}
@inProceedings{Ng97,
  author    = {Ng, Hwee T. and Goh, Wei B. and Low, Kok L.},
  title     = {Feature selection, perceptron learning, and a usability case
    study for text categorization},
  booktitle = {Proceedings of SIGIR-97, 20th ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {Belkin, Nicholas J. and Narasimhalu, A. Desai and Willett, Peter},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Philadelphia, {US}},
  year      = {1997},
  pages     = {67--73},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/258525/p67-ng/p67-ng.pdf},
  abstract  = {In this paper, we describe an automated learning approach to text
    categorization based on perceptron learning and a new feature
    selection metric, called correlation coefficient. Our approach
    has been tested on the standard Reuters text categorization
    collection. Empirical results indicate that our approach
    outperforms the best published results on this Reuters
    collection. In particular, our new feature selection method
    yields considerable improvement. We also investigate the
    usability of our automated learning approach by actually
    developing a system that categorizes texts into a tree of
    categories. We compare the accuracy of our learning approach to a
    rule-based, expert system approach that uses a text
    categorization shell built by Carnegie Group. Although our
    automated learning approach still gives a lower accuracy, by
    appropriately incorporating a set of manually chosen words to use
    as features, the combined, semi-automated approach yields
    accuracy close to the rule-based approach.},
}
@article{Nieto02,
  author   = {Nieto S{\'{a}}nchez, Salvador and Triantaphyllou, Evangelos and
    Kraft, Donald},
  title    = {A feature mining based approach for the classification of text
    documents into disjoint classes},
  journal  = {Information Processing and Management},
  year     = {2002},
  volume   = {38},
  number   = {4},
  pages    = {583--604},
  abstract = {This paper proposes a new approach for classifying text documents
    into two disjoint classes. The new approach is based on
    extracting patterns, in the form of two logical expressions,
    which are defined on various features (indexing terms) of the
    documents. The pattern extraction is aimed at providing
    descriptions (in the form of two logical expressions) of the two
    classes of positive and negative examples. This is achieved by
    means of a data mining approach, called One Clause At a Time
    (OCAT), which is based on mathematical logic. The application of
    a logic-based approach to text document classification is
    critical when one wishes to be able to justify why a particular
    document has been assigned to one class versus the other class.
    This situation occurs, for instance, in declassifying documents
    that have been previously considered important to national
    security and thus are currently being kept as secret. Some
    computational experiments have investigated the effectiveness of
    the OCAT-based approach and compared it to the well-known vector
    space model (VSM). These tests also have investigated finding the
    best indexing terms that could be used in making these
    classification decisions. The results of these computational
    experiments on a sample of 2897 text documents from the TIPSTER
    collection indicate that the first approach has many advantages
    over the VSM approach for solving this type of text document
    classification problem. Moreover, a guided strategy for the
    OCAT-based approach is presented for deciding which document one
    needs to consider next while building the training example sets.},
}
@inProceedings{Nigam98,
  author    = {Nigam, Kamal and McCallum, Andrew K. and Thrun, Sebastian and
    Mitchell, Tom M.},
  title     = {Learning to classify text from labeled and unlabeled documents},
  booktitle = {Proceedings of AAAI-98, 15th Conference of the American
    Association for Artificial Intelligence},
  publisher = {{AAAI} Press, Menlo Park, {US}},
  address   = {Madison, {US}},
  year      = {1998},
  pages     = {792--799},
  note      = {An extended version appears as~\cite{Nigam00}},
  url       = {http://www.cs.cmu.edu/~knigam/papers/emcat-aaai98.ps},
  abstract  = {In many important text classification problems, acquiring class
    labels for training documents is costly, while gathering large
    quantities of unlabeled data is cheap. This paper shows that the
    accuracy of text classifiers trained with a small number of
    labeled documents can be improved by augmenting this small
    training set with a large pool of unlabeled documents. We present
    a theoretical argument showing that, under common assumptions,
    unlabeled data contain information about the target function. We
    then introduce an algorithm for learning from labeled and
    unlabeled text based on the combination of
    Expectation-Maximization with a naive Bayes classifier. The
    algorithm first trains a classifier using the available labeled
    documents, and probabilistically labels the unlabeled documents;
    it then trains a new classifier using the labels for all the
    documents, and iterates to convergence. Experimental results,
    obtained using text from three different real-world tasks, show
    that the use of unlabeled data reduces classification error by up
    to 33\%.},
}
@inProceedings{Nigam00a,
  author    = {Nigam, Kamal and Ghani, Rayid},
  title     = {Analyzing the applicability and effectiveness of co-training},
  booktitle = {Proceedings of CIKM-00, 9th ACM International Conference on
    Information and Knowledge Management},
  editor    = {Agah, Arvin and Callan, Jamie and Rundensteiner, Elke},
  publisher = {{ACM} Press, New York, {US}},
  address   = {McLean, {US}},
  year      = {2000},
  pages     = {86--93},
  url       = {http://www.cs.cmu.edu/~knigam/papers/cotrain-CIKM00.pdf},
  abstract  = {Recently there has been significant interest in supervised
    learning algorithms that combine labeled and unlabeled data for
    text learning tasks. The co-training setting applies to datasets
    that have a natural separation of their features into two
    disjoint sets. We demonstrate that when learning from labeled and
    unlabeled data, algorithms explicitly leveraging a natural
    independent split of the features outperform algorithms that do
    not. When a natural split does not exist, co-training algorithms
    that manufacture a feature split may outperform algorithms not
    using a split. These results help explain why co-training
    algorithms are both discriminative in nature and robust to the
    assumptions of their embedded classifiers.},
}
@article{Nigam00,
  author   = {Nigam, Kamal and McCallum, Andrew K. and Thrun, Sebastian and
    Mitchell, Tom M.},
  title    = {Text Classification from Labeled and Unlabeled Documents using
    {EM}},
  journal  = {Machine Learning},
  year     = {2000},
  volume   = {39},
  number   = {2/3},
  pages    = {103--134},
  url      = {http://www.cs.cmu.edu/~knigam/papers/emcat-mlj99.ps},
  abstract = {This paper shows that the accuracy of learned text classifiers
    can be improved by augmenting a small number of labeled training
    documents with a large pool of unlabeled documents. This is
    important because in many text classification problems obtaining
    training labels is expensive, while large quantities of unlabeled
    documents are readily available. We introduce an algorithm for
    learning from labeled and unlabeled documents based on the
    combination of Expectation-Maximization (EM) and a naive Bayes
    classifier. The algorithm first trains a classifier using the
    available labeled documents, and probabilistically labels the
    unlabeled documents. It then trains a new classifier using the
    labels for all the documents, and iterates to convergence. This
    basic EM procedure works well when the data conform to the
    generative assumptions of the model. However these assumptions
    are often violated in practice, and poor performance can result.
    We present two extensions to the algorithm that improve
    classification accuracy under these conditions: (1) a weighting
    factor to modulate the contribution of the unlabeled data, and
    (2) the use of multiple mixture components per class.
    Experimental results, obtained using text from three different
    real-world tasks, show that the use of unlabeled data reduces
    classification error by up to 30\%.},
}
@phdThesis{Nigam01,
  author   = {Nigam, Kamal},
  title    = {Using Unlabeled Data to Improve Text Classification},
  school   = {Computer Science Department, Carnegie Mellon University},
  address  = {Pittsburgh, {US}},
  year     = {2001},
  url      = {http://www-2.cs.cmu.edu/~knigam/papers/thesis-nigam.pdf},
  abstract = {One key difficulty with text classification learning algorithms
    is that they require many hand-labeled examples to learn
    accurately. This dissertation demonstrates that supervised
    learning algorithms that use a small number of labeled examples
    and many inexpensive unlabeled examples can create high-accuracy
    text classifiers. By assuming that documents are created by a
    parametric generative model, Expectation-Maximization (EM) finds
    local maximum a posteriori models and classifiers from all the
    data---labeled and unlabeled. These generative models do not
    capture all the intricacies of text; however on some domains this
    technique substantially improves classification accuracy,
    especially when labeled data are sparse. Two problems arise from
    this basic approach. First, unlabeled data can hurt performance
    in domains where the generative modeling assumptions are too
    strongly violated. In this case the assumptions can be made more
    representative in two ways: by modeling sub-topic class
    structure, and by modeling super-topic hierarchical class
    relationships. By doing so, model probability and classification
    accuracy come into correspondence, allowing unlabeled data to
    improve classification performance. The second problem is that
    even with a representative model, the improvements given by
    unlabeled data do not sufficiently compensate for a paucity of
    labeled data. Here, limited labeled data provide EM
    initializations that lead to low-probability models. Performance
    can be significantly improved by using active learning to select
    high-quality initializations, and by using alternatives to EM
    that avoid low-probability local maxima.},
}
@inProceedings{Oh00,
  author    = {Oh, Hyo-Jung and Myaeng, Sung Hyon and Lee, Mann-Ho},
  title     = {A practical hypertext categorization method using links and
    incrementally available class information},
  booktitle = {Proceedings of SIGIR-00, 23rd ACM International Conference on
    Research and Development in Information Retrieval},
  editor    = {Belkin, Nicholas J. and Ingwersen, Peter and Leong, Mun-Kew},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Athens, {GR}},
  year      = {2000},
  pages     = {264--271},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/345508/p264-oh/p264-oh.pdf},
  abstract  = {As WWW grows at an increasing speed, a classifier targeted at
    hypertext has become in high demand. While document
    categorization is quite a mature, the issue of utilizing
    hypertext structure and hyperlinks has been relatively
    unexplored. In this paper, we propose a practical method for
    enhancing both the speed and the quality of hypertext
    categorization using hyperlinks. In comparison against a recently
    proposed technique that appears to be the only one of the kind,
    we obtained up to 18.5\% of improvement in effectiveness while
    reducing the processing time dramatically. We attempt to explain
    through experiments what factors contribute to the improvement.},
}
@inProceedings{Ontrup01,
  author    = {Ontrup, J{\"{o}}rg and Ritter, Helge},
  title     = {Text Categorization and Semantic Browsing with Self-Organizing
    Maps on Non-{E}uclidean Spaces},
  booktitle = {Proceedings of PKDD-01, 5th European Conference on Principles and
    Practice of Knowledge Discovery in Databases},
  editor    = {De Raedt, Luc and Siebes, Arno},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Freiburg, {DE}},
  year      = {2001},
  pages     = {338--349},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
    number 2168},
  url       = {http://www.techfak.uni-bielefeld.de/ags/ni/publications/papers/OntrupRitter2001-TCA.pdf},
  abstract  = {This paper introduces a new type of Self-Organizing Map (SOM) for
    Text Categorization and Semantic Browsing. We propose a
    ``hyperbolic SOM'' (HSOM) based on a regular tesselation of the
    hyperbolic plane, which is a non-euclidean space characterized by
    constant negative gaussian curvature. This approach is motivated
    by the observation that hyperbolic spaces possess a geometry
    where the size of a neighborhood around a point increases
    exponentially and therefore provides more freedom to map a
    complex information space such as language into spatial
    relations. These theoretical findings are supported by our
    experiments, which show that hyperbolic SOMs can successfully be
    applied to text categorization and yield results comparable to
    other state-of-the-art methods. Furthermore we demonstrate that
    the HSOM is able to map large text collections in a semantically
    meaningful way and therefore allows a ``semantic browsing'' of
    text databases.},
}
% NOTE(review): the key suffix is 98 but the year field says 1999 --
% confirm the publication year of issue 21 against the journal.
@article{Paijmans98,
  author   = {Paijmans, Hans},
  title    = {Text categorization as an information retrieval task},
  journal  = {The South African Computer Journal},
  year     = {1999},
  number   = {21},
  pages    = {4--15},
  abstract = {A number of methods for feature reduction and feature selection
    in text classification and information retrieval systems are
    compared. These include feature sets that are constructed by
    Latent Semantic Indexing, `local dictionaries' in the form of the
    words that score highest in frequency in positive class examples
    and feature sets that are constructed by relevance feedback
    strategies such as J.J. Rocchio's (1971) feedback algorithm or
    genetic algorithms. Also, different derivations from the normal
    recall and precision performance indicators are discussed and
    compared. It was found that categorizers consisting of the words
    with highest tf.idf values scored best.},
}
@inProceedings{Paliouras99,
  author    = {Paliouras, Georgios and Karkaletsis, Vangelis and Spyropoulos,
    Constantine D.},
  title     = {Learning rules for large vocabulary word sense disambiguation},
  booktitle = {Proceedings of IJCAI-99, 16th International Joint Conference on
    Artificial Intelligence},
  editor    = {Dean, Thomas},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Stockholm, {SE}},
  year      = {1999},
  pages     = {674--679},
  url       = {http://www.iit.demokritos.gr/~paliourg/papers/IJCAI99.ps.gz},
  abstract  = {Word Sense Disambiguation (WSD) is the process of distinguishing
    between different senses of a word. In general, the
    disambiguation rules differ for different words. For this reason,
    the automatic construction of disambiguation rules is highly
    desirable. One way to achieve this aim is by applying machine
    learning techniques to training data containing the various
    senses of the ambiguous words. In the work presented here, the
    decision tree learning algorithm C4.5 is applied on a corpus of
    financial news articles. Instead of concentrating on a small set
    of ambiguous words, as done in most of the related previous work,
    all content words of the examined corpus are disambiguated.
    Furthermore, the effectiveness of word sense disambiguation for
    different parts of speech (nouns and verbs) is examined
    empirically.},
}
@inProceedings{Peng03,
  author    = {Peng, Fuchun and Schuurmans, Dale},
  title     = {Combining naive {B}ayes and $n$-gram language models for text
    classification},
  booktitle = {Proceedings of ECIR-03, 25th European Conference on Information
    Retrieval},
  editor    = {Sebastiani, Fabrizio},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Pisa, {IT}},
  year      = {2003},
  pages     = {335--350},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
    number 2633},
  url       = {http://link.springer.de/link/service/series/0558/papers/2633/26330335.pdf},
  abstract  = {We augment the naive Bayes model with an n-gram language model to
    address two shortcomings of naive Bayes text classifiers. The
    chain augmented naive Bayes classifiers we propose have two
    advantages over standard naive Bayes classifiers. First, a chain
    augmented naive Bayes model relaxes some of the independence
    assumptions of naive Bayes--allowing a local Markov chain
    dependence in the observed variables--while still permitting
    efficient inference and learning. Second, smoothing techniques
    from statistical language modeling can be used to recover better
    estimates than the Laplace smoothing techniques usually used in
    naive Bayes classification. Our experimental results on three
    real world data sets show that we achieve substantial
    improvements over standard naive Bayes classification, while also
    achieving state of the art performance that competes with the
    best known methods in these cases.},
}
@inProceedings{Peng03a,
  author    = {Peng, Fuchun and Schuurmans, Dale and Wang, Shaojun},
  title     = {Language and Task Independent Text Categorization with Simple
    Language Models},
  booktitle = {Proceedings of HLT-03, 3rd Human Language Technology Conference},
  address   = {Edmonton, {CA}},
  year      = {2003},
}
@inProceedings{Petasis00,
  author    = {Georgios Petasis and Alessandro Cucchiarelli and Paola Velardi
               and Georgios Paliouras and Vangelis Karkaletsis and Constantine
               D. Spyropoulos},
  title     = {Automatic adaptation of proper noun dictionaries through
               cooperation of machine learning and probabilistic methods},
  booktitle = {Proceedings of SIGIR-00, 23rd ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Athens, {GR}},
  year      = {2000},
  pages     = {128--135},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/345508/p128-petasis/p128-petasis.pdf},
  abstract  = {The recognition of Proper Nouns (PNs) is considered an important
               task in the area of Information Retrieval and Extraction. However
               the high performance of most existing PN classifiers heavily
               depends upon the availability of large dictionaries of
               domain-specific Proper Nouns, and a certain amount of manual work
               for rule writing or manual tagging. Though it is not a heavy
               requirement to rely on some existing PN dictionary (often these
               resources are available on the web), its coverage of a domain
               corpus may be rather low, in absence of manual updating. In this
               paper we propose a technique for the automatic updating of a PN
               Dictionary through the cooperation of an inductive and a
               probabilistic classifier. In our experiments we show that,
               whenever an existing PN Dictionary allows the identification of
               50\% of the proper nouns within a corpus, our technique allows,
               without additional manual effort, the successful recognition of
               about 90\% of the remaining 50\%.},
}
@inProceedings{Peters02,
  author    = {C. Peters and Cornelis H. Koster},
  title     = {Uncertainty-based Noise Reduction and Term Selection in Text
               Categorization},
  booktitle = {Proceedings of ECIR-02, 24th European Colloquium on Information
               Retrieval Research},
  editor    = {Fabio Crestani and Mark Girolami and Cornelis J. Van Rijsbergen},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Glasgow, {UK}},
  year      = {2002},
  pages     = {248--267},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 2291},
  url       = {http://link.springer.de/link/service/series/0558/papers/2291/22910248.pdf},
  abstract  = {This paper introduces a new criterium for term selection, which
               is based on the notion of Uncertainty. Term selection according
               to this criterium is performed by the elimination of noisy terms
               on a class-by-class basis, rather than by selecting the most
               significant ones. Uncertainty-based term selection (UC) is
               compared to a number of other criteria like Information Gain
               (IG), simplified chi-square (SX), Term Frequency (TF) and
               Document Frequency (DF) in a Text Categorization setting.
               Experiments on data sets with different properties
               (Reuters-21578, patent abstracts and patent applications) and
               with two different algorithms (Winnow and Rocchio) show that
               UC-based term selection is not the most aggressive term selection
               criterium, but that its effect is quite stable across data sets
               and algorithms. This makes it a good candidate for a general
               ``install-and-forget'' term selection mechanism. We also describe
               and evaluate a hybrid Term Selection technique, first applying UC
               to eliminate noisy terms and then using another criterium to
               select the best terms.},
}
@inProceedings{Ragas98,
  author    = {Hein Ragas and Cornelis H. Koster},
  title     = {Four text classification algorithms compared on a {D}utch corpus},
  booktitle = {Proceedings of SIGIR-98, 21st ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {W. Bruce Croft and Alistair Moffat and Cornelis J. Van Rijsbergen
               and Ross Wilkinson and Justin Zobel},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Melbourne, {AU}},
  year      = {1998},
  pages     = {369--370},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/290941/p369-ragas/p369-ragas.pdf},
  abstract  = {We describe an experiment in applying text classification
               algorithms to Dutch texts. Four well known learning algorithms:
               Rocchio's algorithm (W.W. Cohen and Y. Singer, 1995), the Simple
               Bayesian Classifier (SBC) (R.O. Duda and P.E. Hart, 1973), the
               Sleeping Experts (SE) and Winnow (I. Dagan et al., 1997) were
               implemented. They were tested on a corpus of articles from the
               Dutch newspaper NRC, and pre-classified into four categories. The
               algorithms are compared on learning speed and error rate. We also
               investigated the effect of discarding terms, using either a
               dynamic stoplist or the Winnow heuristic.},
}
@inProceedings{Raskutti01,
  author    = {Bhavani Raskutti and Herman Ferr{\'{a}} and Adam Kowalczyk},
  title     = {Second Order Features for Maximising Text Classification
               Performance},
  booktitle = {Proceedings of ECML-01, 12th European Conference on Machine
               Learning},
  editor    = {Luc De Raedt and Peter A. Flach},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  address   = {Freiburg, {DE}},
  year      = {2001},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 2167},
  url       = {http://link.springer.de/link/service/series/0558/papers/2167/21670454.pdf},
  abstract  = {The paper demonstrates that the addition of automatically
               selected word-pairs substantially increases the accuracy of text
               classification which is contrary to most previously reported
               research. The word-pairs are selected automatically using a
               technique based on frequencies of n-grams (sequences of
               characters), which takes into account both the frequencies of
               word-pairs as well as the context in which they occur. These
               improvements are reported for two different classifiers, support
               vector machines (SVM) and k-nearest neighbours (kNN), and two
               different text corpora. For the first of them, a collection of
               articles from PC Week magazine, the addition of word-pairs
               increases micro-averaged breakeven accuracy by more than 6\%
               point from a baseline accuracy (without pairs) of around 40\%.
               For second one, the standard Reuters benchmark, SVM classifier
               using augmentation with pairs outperforms all previously reported
               results.},
}
@inProceedings{Rau91,
  author    = {Lisa F. Rau and Paul S. Jacobs},
  title     = {Creating segmented databases from free text for text retrieval},
  booktitle = {Proceedings of SIGIR-91, 14th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Abraham Bookstein and Yves Chiaramella and Gerard Salton and
               Vijay V. Raghavan},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Chicago, {US}},
  year      = {1991},
  pages     = {337--346},
  url       = {http://www.acm.org/pubs/articles/proceedings/ir/122860/p337-rau/p337-rau.pdf},
  abstract  = {Indexing text for accurate retrieval is a difficult and important
               problem. On-line information services generally depend on keyword
               indices rather than other methods of retrieval, because of the
               practical features of keywords for storage, dissemination, and
               browsing as well as for retrieval. However, these methods of
               indexing have two major drawbacks: First, they must be
               laboriously assigned by human indexers. Second, they are
               inaccurate, because of mistakes made by these indexers as well as
               the difficulties users have in choosing keywords for their
               queries, and the ambiguity a keyword may have. Current natural
               language text processing (NLP) methods help to overcome these
               problems. Such methods can provide automatic indexing and keyword
               assignment capabilities that are at least as accurate as human
               indexers in many applications. In addition, NLP systems can
               increase the information contained in keyword fields by
               separating keywords into segments, or distinct fields that
               capture certain discriminating content or relations among
               keywords. This paper reports on a system that uses natural
               language text processing to derive keywords from free text news
               stories, separate these keywords into segments, and automatically
               build a segmented database. The system is used as part of a
               commercial news clipping and retrieval product. Preliminary
               results show improved accuracy, as well as reduced cost,
               resulting from these automated techniques.},
}
@inProceedings{Rennie99,
  author    = {Jason Rennie and Andrew Kachites McCallum},
  title     = {Using reinforcement learning to spider the {W}eb efficiently},
  booktitle = {Proceedings of ICML-99, 16th International Conference on Machine
               Learning},
  editor    = {Ivan Bratko and Saso Dzeroski},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Bled, {SI}},
  year      = {1999},
  pages     = {335--343},
  url       = {http://www.watson.org/~jrennie/papers/icml99.ps.gz},
  abstract  = {Consider the task of exploring the Web in order to find pages of
               a particular kind or on a particular topic. This task arises in
               the construction of search engines and Web knowledge bases. The
               paper argues that the creation of efficient Web spiders is best
               framed and solved by reinforcement learning, a branch of machine
               learning that concerns itself with optimal sequential decision
               making. One strength of reinforcement learning is that it
               provides a formalism for measuring the utility of actions that
               give benefit only in the future. We present an algorithm for
               learning a value function that maps hyperlinks to future
               discounted reward using a naive Bayes text classifier.
               Experiments on two real-world spidering tasks show a three-fold
               improvement in spidering efficiency over traditional
               breadth-first search, and up to a two-fold improvement over
               reinforcement learning with immediate reward only.},
}
@inProceedings{Rennie03,
  author    = {Jason Rennie and Lawrence Shih and Jaime Teevan and David Karger},
  title     = {Tackling the Poor Assumptions of Naive {B}ayes Text Classifiers},
  booktitle = {Proceedings of ICML-03, 20th International Conference on Machine
               Learning},
  editor    = {},
  publisher = {Morgan Kaufmann Publishers, San Francisco, {US}},
  address   = {Washington, {US}},
  year      = {2003},
  pages     = {},
  url       = {},
  abstract  = {},
}
@article{RibeiroNeto01,
  author    = {Berthier Ribeiro-Neto and Alberto H. F. Laender and Luciano R. {De
               Lima}},
  title     = {An Experimental Study in Automatically Categorizing Medical
               Documents},
  journal   = {Journal of the American Society for Information Science and
               Technology},
  year      = {2001},
  volume    = {52},
  number    = {5},
  pages     = {391--401},
  url       = {http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76511157&PLACEBO=IE.pdf},
  abstract  = {In this article, we evaluate the retrieval performance of an
               algorithm that automatically categorizes medical documents. The
               categorization, which consists in assigning an International Code
               of Disease (ICD) to the medical document under examination, is
               based on well-known information retrieval techniques. The
               algorithm, which we proposed, operates in a fully automatic mode
               and requires no supervision or training data. Using a database of
               20,569 documents, we verify that the algorithm attains levels of
               average precision in the 70-80\% range for category coding and in
               the 60-70\% range for subcategory coding. We also carefully
               analyze the case of those documents whose categorization is not
               in accordance with the one provided by the human specialists. The
               vast majority of them represent cases that can only be fully
               categorized with the assistance of a human subject (because, for
               instance, they require specific knowledge of a given pathology).
               For a slim fraction of all documents (0.77\% for category coding
               and 1.4\% for subcategory coding), the algorithm makes
               assignments that are clearly incorrect. However, this fraction
               corresponds to only one-fourth of the mistakes made by the human
               specialists.},
}
@inProceedings{Riloff93,
  author    = {Ellen Riloff},
  title     = {Using Cases to Represent Context for Text Classification},
  booktitle = {Proceedings of CIKM-93, 2nd International Conference on
               Information and Knowledge Management},
  editor    = {Bharat Bhargava and Timothy Finin and Yelena Yesha},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Washington, {US}},
  year      = {1993},
  pages     = {105--113},
  url       = {http://www.cs.utah.edu/~riloff/psfiles/cikm93-w-addend.ps},
  abstract  = {Research on text classification has typically focused on keyword
               searches and statistical techniques. Keywords alone cannot always
               distinguish the relevant from the irrelevant texts and some
               relevant texts do not contain any reliable keywords at all. Our
               approach to text classification uses case-based reasoning to
               represent natural language contexts that can be used to classify
               texts with extremely high precision. The case base of natural
               language contexts is acquired automatically during sentence
               analysis using a training corpus of texts and their correct
               relevancy classifications. A text is represented as a set of
               cases and we classify a text as relevant if any of its cases is
               deemed to be relevant. We rely on the statistical properties of
               the case base to determine whether similar cases are highly
               correlated with relevance for the domain. Experiments with the
               MUC corpus suggest that case-based text classification can
               achieve very high levels of precision and outperforms our
               previous algorithms based on relevancy signatures.},
}
@phdThesis{Riloff94a,
  author    = {Ellen Riloff},
  title     = {Information Extraction as a Basis for Portable Text
               Classification Systems},
  school    = {Department of Computer Science, University of Massachusetts},
  address   = {Amherst, {US}},
  year      = {1994},
  url       = {http://www.cs.utah.edu/~riloff/psfiles/single-thesis.ps},
  abstract  = {Knowledge-based natural language processing systems have achieved
               good success with many tasks, but they often require many
               person-months of effort to build an appropriate knowledge base.
               As a result, they are not portable across domains. This
               knowledge-engineering bottleneck must be addressed before
               knowledge-based systems will be practical for real-world
               applications. This dissertation addresses the
               knowledge-engineering bottleneck for a natural language
               processing task called ``information extraction''. A system
               called AutoSlog is presented which automatically constructs
               dictionaries for information extraction, given an appropriate
               training corpus. In the domain of terrorism, AutoSlog created a
               dictionary using a training corpus and five person-hours of
               effort that achieved 98\% of the performance of a hand-crafted
               dictionary that took approximately 1500 person-hours to build.
               This dissertation also describes three algorithms that use
               information extraction to support high-precision text
               classification. As more information becomes available on-line,
               intelligent information retrieval will be crucial in order to
               navigate the information highway efficiently and effectively. The
               approach presented here represents a compromise between
               keyword-based techniques and in-depth natural language
               processing. The text classification algorithms classify texts
               with high accuracy by using an underlying information extraction
               system to represent linguistic phrases and contexts. Experiments
               in the terrorism domain suggest that increasing the amount of
               linguistic context can improve performance. Both AutoSlog and the
               text classification algorithms are evaluated in three domains:
               terrorism, joint ventures, and microelectronics. An important
               aspect of this dissertation is that AutoSlog and the text
               classification systems can be easily ported across domains.},
}
@article{Riloff94,
  author    = {Ellen Riloff and Wendy Lehnert},
  title     = {Information extraction as a basis for high-precision text
               classification},
  journal   = {{ACM} Transactions on Information Systems},
  year      = {1994},
  volume    = {12},
  number    = {3},
  pages     = {296--333},
  url       = {http://www.cs.utah.edu/~riloff/psfiles/single-acm.ps},
  abstract  = {We describe an approach to text classification that represents a
               compromise between traditional word-based techniques and in-depth
               natural language processing. Our approach uses a natural language
               processing task called information extraction as a basis for
               high-precision text classification. We present three algorithms
               that use varying amounts of extracted information to classify
               texts. The relevancy signatures algorithm uses linguistic
               phrases, the augmented relevancy signatures algorithm uses
               phrases and local context, and the case-based text classification
               algorithm uses larger pieces of context. Relevant phrases and
               contexts are acquired automatically using a training corpus. We
               evaluate the algorithms on the basis of two test sets from the
               MUC-4 corpus. All three algorithms achieved high precision on
               both test sets, with the augmented relevancy signatures algorithm
               and the case-based algorithm reaching 100\% precision with over
               60\% recall on one set. In addition, we compare the algorithms on
               a larger collection of 1700 texts and describe an automated
               method for empirically deriving appropriate threshold values. The
               results suggest that information extraction techniques can
               support high-precision text classification and, in general, using
               more extracted information improves performance. As a practical
               matter, we also explain how the text classification system can be
               easily ported across domains.},
}
@inProceedings{Riloff95,
  author    = {Ellen Riloff},
  title     = {Little Words Can Make a Big Difference for Text Classification},
  booktitle = {Proceedings of SIGIR-95, 18th ACM International Conference on
               Research and Development in Information Retrieval},
  editor    = {Edward A. Fox and Peter Ingwersen and Raya Fidel},
  publisher = {{ACM} Press, New York, {US}},
  address   = {Seattle, {US}},
  year      = {1995},
  pages     = {130--136},
  url       = {http://www.cs.utah.edu/~riloff/psfiles/sigir95.ps},
  abstract  = {Most information retrieval systems use stopword lists and
               stemming algorithms. However, we have found that recognizing
               singular and plural nouns, verb forms, negation, and prepositions
               can produce dramatically different text classification results.
               We present results from text classification experiments that
               compare relevancy signatures, which use local linguistic context,
               with corresponding indexing terms that do not. In two different
               domains, relevancy signatures produced better results than the
               simple indexing terms. These experiments suggest that stopword
               lists and stemming algorithms may remove or conflate many words
               that could be used to create more effective indexing terms.},
}
@inProceedings{Riloff96,
  author    = {Ellen Riloff},
  title     = {Using Learned Extraction Patterns for Text Classification},
  booktitle = {Connectionist, statistical, and symbolic approaches to learning
               for natural language processing},
  editor    = {Stefan Wermter and Ellen Riloff and Gabriele Scheler},
  publisher = {Springer Verlag, Heidelberg, {DE}},
  year      = {1996},
  pages     = {275--289},
  note      = {Published in the ``Lecture Notes in Computer Science'' series,
               number 1040},
  url       = {http://www.cs.utah.edu/~riloff/psfiles/ijcai-book-chapter.ps},
  abstract  = {A major knowledge-engineering bottleneck for information
               extraction systems is the process of constructing an appropriate
               dictionary of extraction patterns. AutoSlog is a dictionary
               construction system that has been shown to substantially reduce
               the time required for knowledge engineering by learning
               extraction patterns automatically. However, an open question was
               whether these extraction patterns were useful for tasks other
               than information extraction. The author describes a series of
               experiments that show how the extraction patterns learned by
               AutoSlog can be used for text classification. Three dictionaries
               produced by AutoSlog for different domains performed well in the
               author's text classification experiments.},
}
@inProceedings{Riloff92,
  author    = {Ellen Riloff and Wendy Lehnert},
  title     = {Classifying Texts Using Relevancy Signatures},
  booktitle = {Proceedings of AAAI-92, 10th Conference of the American
               Association for Artificial Intelligence},
  editor    = {},
  publisher = {{AAAI} Press, Menlo Park, {US}},
  address   = {San Jose, {US}},
  year      = {1992},
  pages     = {329--334},
  url       = {},
  abstract  = {},
}
@inCollection{Riloff99,
  author    = {Ellen Riloff and Jeffrey Lorenzen},
  title     = {Extraction-based Text Categorization: Generating Domain-specific
               Role Relationships},
  booktitle = {Natural language information retrieval},
  editor    = {Tomek Strzalkowski},
  publisher = {Kluwer Academic Publishers},
  address   = {Dordrecht, {NL}},
  year      = {1999},
  pages     = {167--196},
  url       = {http://www.cs.utah.edu/~riloff/psfiles/nlp-ir-chapter.ps},
  abstract  = {In previous work, we developed several algorithms that use
               information extraction techniques to achieve high-precision text
               categorization. The relevancy signatures algorithm classifies
               texts using extraction patterns, and the augmented relevancy
               signatures algorithm classifies texts using extraction patterns
               and semantic features associated with role fillers (Riloff and
               Lehnert, 1994). These algorithms relied on hand-coded training
               data, including annotated texts and a semantic dictionary. In
               this chapter, we describe two advances that significantly improve
               the practicality of our approach. First, we explain how the
               extraction patterns can be generated automatically using only
               preclassified texts as input. Second, we present the
               word-augmented relevancy signatures algorithm that uses lexical
               items to represent domain-specific role relationships instead of
               semantic features. Using these techniques, we can automatically
               build text categorization systems that benefit from
               domain-specific natural language processing.},
}
@article{Robertson84,
  author    = {Stephen E. Robertson and P. Harding},
  title     = {Probabilistic automatic indexing by learning from human indexers},
  journal   = {Journal of Documentation},
  year      = {1984},
  volume    = {40},
  number    = {4},
  pages     = {264--270},
  url       = {},
  abstract  = {},
}
@inProceedings{Roth98,
author = {Dan Roth},
title = {Learning to resolve natural language ambigui