%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%                                                                      %
%                                                                      %
%          A Bibliography on Automatic Text Categorization             %
%                                                                      %
%                   compiled and maintained by                         %
%                                                                      %
%                      Fabrizio Sebastiani                             %
%              Istituto di Elaborazione dell'Informazione              %
%                  Consiglio Nazionale delle Ricerche                  %
%             Via Giuseppe Moruzzi, 1 - 56124 Pisa, Italy              %
%                 http://faure.iei.pi.cnr.it/~fabrizio/                %
%                                                                      %
%                                                                      %
% This is a bibliography, in BibTeX format, on automatic text          %
% categorization (ATC), defined as the activity of automatically       %
% building, by means of machine learning techniques, automatic text    %
% classifiers, i.e. systems capable of assigning to a text             %
% document one or more thematic categories from a predefined set.      %
%                                                                      %
% This bibliography resides at                                         %
%        http://faure.iei.pi.cnr.it/~fabrizio/ATCbibliography.bib      %
% Everyone is welcome to download it as a whole and distribute it,     %
% provided that it is distributed untouched.                           %
%                                                                      %
% Everyone is also welcome to let me know either additional            %
% references or corrections and additions (e.g. URLs, where            %
% they are not already present) to the existing ones.                  %
% In general, only references specific to ATC are considered           %
% pertinent to this bibliography; in particular, references that       %
% #are# considered pertinent are:                                      %
%                                                                      %
% * publications that discuss novel ATC methods, novel                 %
%   experimentation of previously known methods, or resources for      %
%   ATC experimentation;                                               %
%                                                                      %
% * publications that discuss applications of ATC (e.g.                %
%   automated indexing for Boolean IR systems, filtering, etc.).       %
%                                                                      %
% References that are #not# considered pertinent are:                  %
%                                                                      %
% * publications that discuss techniques in principle useful for       %
%   ATC (e.g. machine learning techniques, information retrieval       %
%   techniques) but do not explicitly discuss their application        %
%   to ATC;                                                            %
%                                                                      %
% * publications that discuss related topics sometimes confused with   %
%   ATC; these include, in particular, text clustering (i.e. text      %
%   classification by unsupervised learning) and text indexing;        %
%                                                                      %
% * technical reports and workshop papers. Only papers that have       %
%   been the object of formal publication (i.e. conferences and        %
%   journals) are to be included in the bibliography, so as to avoid   %
%   its explosion and the inclusion of material bound to obsolescence. %
%                                                                      %
% Concerning URLs from which to download on-line copies of the         %
% papers, where possible I have included URLs with unrestricted        %
% access (e.g. home pages of authors). When such URLs were not         %
% available, sometimes a URL with restricted access (e.g. the          %
% ACM Digital Library or the IEEE Computing Society Digital            %
% Library, which are accessible to subscribers only) is indicated.     %
% When this is the case, if you know of a URL with unrestricted access %
% from which the paper is also available, please let me know and I     %
% will substitute the link.                                            %
%                                                                      %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
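%
% For readers new to the field, the following minimal sketch (not part
% of the bibliography proper) illustrates the task defined above: a
% classifier is learned from pre-labelled documents and then assigns
% one or more categories from a predefined set to unseen documents.
% It is only an illustration under assumed tools: it uses the
% scikit-learn Python library, and the documents and category names
% are invented for the example.
%
%   from sklearn.feature_extraction.text import TfidfVectorizer
%   from sklearn.multiclass import OneVsRestClassifier
%   from sklearn.preprocessing import MultiLabelBinarizer
%   from sklearn.svm import LinearSVC
%
%   # Training documents, each labelled with one or more thematic categories.
%   docs = ["wheat prices rose sharply",
%           "the central bank cut interest rates",
%           "grain exports and currency markets"]
%   labels = [{"grain"}, {"money-fx"}, {"grain", "money-fx"}]
%
%   # Index the documents as tf-idf weighted term vectors.
%   vectorizer = TfidfVectorizer()
%   X = vectorizer.fit_transform(docs)
%
%   # Learn one binary classifier per category (multi-label setting).
%   binarizer = MultiLabelBinarizer()
%   Y = binarizer.fit_transform(labels)
%   classifier = OneVsRestClassifier(LinearSVC()).fit(X, Y)
%
%   # Assign categories to an unseen document.
%   X_new = vectorizer.transform(["interest rates and grain futures"])
%   print(binarizer.inverse_transform(classifier.predict(X_new)))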

@inProceedings{Adam02,
   author       = {Chai K. Adam and Hwee T. Ng and Hai L. Chieu},
   title        = {Bayesian Online Classifiers for Text Classification and Filtering},
   booktitle    = {Proceedings of SIGIR-02, 25th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng 
                   and Kalervo J{\"{a}}rvelin},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Tampere, {FI}},
   year         = {2002},
   pages        = {97--104},
   url          = {http://doi.acm.org/10.1145/564376.564395},
   abstract     = {This paper explores the use of Bayesian online classifiers to 
                   classify text documents. Empirical results indicate that these 
                   classifiers are comparable with the best text classification 
                   systems. Furthermore, the online approach offers the advantage of 
                   continuous learning in the batch-adaptive text filtering task.},
}
@inProceedings{Aggarwal99,
   author       = {Charu C. Aggarwal and Stephen C. Gates and Philip S. Yu},
   title        = {On the merits of building categorization systems by supervised 
                   clustering},
   booktitle    = {Proceedings of KDD-99, 5th ACM International Conference on 
                    Knowledge Discovery and Data Mining},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1999},
   address      = {San Diego, {US}},
   pages        = {352--356},
   url          = {http://doi.acm.org/10.1145/312129.312279},
   abstract     = {This paper investigates the use of supervised clustering in order 
                   to create sets of categories for classification of documents. We 
                   use information from a pre-existing taxonomy in order to 
                   supervise the creation of a set of related clusters, though with 
                   some freedom in defining and creating the classes. We show that 
                   the advantage of using supervised clustering is that it is 
                   possible to have some control over the range of subjects that one 
                   would like the categorization system to address, but with a 
                   precise mathematical definition of each category. We then 
                   categorize documents using this a priori knowledge of the 
                   definition of each category. We also discuss a new technique to 
                   help the classifier distinguish better among closely related 
                   clusters. Finally, we show empirically that this categorization 
                   system utilizing a machine-derived taxonomy performs as well as a 
                   manual categorization process, but at a far lower cost.},
}
@inProceedings{Agrawal00,
   author       = {Rakesh Agrawal and Roberto J. Bayardo and Ramakrishnan Srikant},
   title        = {{\sc Athena}: Mining-based Interactive Management of Text 
                   Databases},
   booktitle    = {Proceedings of EDBT-00, 7th International Conference on Extending 
                    Database Technology},
   editor       = {Carlo Zaniolo and Peter C. Lockemann and Marc H. Scholl and 
                   Torsten Grust},
   year         = {2000},
   address      = {Konstanz, {DE}},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1777},
   pages        = {365--379},
   url          = {http://www.almaden.ibm.com/cs/people/ragrawal/papers/athena.ps},
   abstract     = {We describe Athena: a system for creating, exploiting, and 
                   maintaining a hierarchical arrangement of textual documents 
                   through interactive mining-based operations. Requirements of any 
                   such system include speed and minimal end-user effort. Athena 
                   satisfies these requirements through linear-time classification 
                   and clustering engines which are applied interactively to speed 
                   the development of accurate models. Naive Bayes classifiers are 
                   recognized to be among the best for classifying text. We show 
                   that our specialization of the Naive Bayes classifier is 
                   considerably more accurate (7 to 29\% absolute increase in 
                   accuracy) than a standard implementation. Our enhancements 
                   include using Lidstone's law of succession instead of Laplace's 
                   law, under-weighting long documents, and over-weighting author 
                   and subject. We also present a new interactive clustering 
                   algorithm, C-Evolve, for topic discovery. C-Evolve first finds 
                   highly accurate cluster digests (partial clusters), gets user 
                   feedback to merge and correct these digests, and then uses the 
                   classification algorithm to complete the partitioning of the 
                   data. By allowing this interactivity in the clustering process, 
                   C-Evolve achieves considerably higher clustering accuracy (10 to 
                   20\% absolute increase in our experiments) than the popular 
                   K-Means and agglomerative clustering methods.},
}
@inProceedings{Agrawal01,
   author       = {Rakesh Agrawal and Ramakrishnan Srikant},
   title        = {On integrating catalogs},
   booktitle    = {Proceedings of WWW-01, 10th International Conference on the World 
                   Wide Web},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {},
   year         = {2001},
   address      = {Hong Kong, {CN}},
   pages        = {603--612},
   url          = {http://doi.acm.org/10.1145/371920.372163},
   abstract     = {We address the problem of integrating documents from different 
                   sources into a master catalog. This problem is pervasive in web 
                   marketplaces and portals. Current technology for automating this 
                   process consists of building a classifier that uses the 
                   categorization of documents in the master catalog to construct a 
                   model for predicting the category of unknown documents. Our key 
                   insight is that many of the data sources have their own 
                   categorization, and classification accuracy can be improved by 
                   factoring in the implicit information in these source 
                   categorizations. We show how a Naive Bayes classification can be 
                   enhanced to incorporate the similarity information present in 
                   source catalogs. Our analysis and empirical evaluation show 
                   substantial improvement in the accuracy of catalog integration.},
}
@inProceedings{Aizawa00,
   author       = {Akiko Aizawa},
   title        = {The feature quantity: an information-theoretic perspective of 
                   tfidf-like measures},
   booktitle    = {Proceedings of SIGIR-00, 23rd ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Athens, {GR}},
   year         = {2000},
   pages        = {104--111},
   url          = {http://doi.acm.org/10.1145/345508.345556},
   abstract     = {The feature quantity, a quantitative representation of 
                   specificity introduced in this paper, is based on an information 
                   theoretic perspective of co-occurrence events between terms and 
                   documents. Mathematically, the feature quantity is defined as a 
                    product of probability and information, and maintains a good 
                   correspondence with the tfidf-like measures popularly used in 
                   today's IR systems. In this paper, we present a formal 
                   description of the feature quantity, as well as some illustrative 
                   examples of applying such a quantity to different types of 
                   information retrieval tasks: representative term selection and 
                   text categorization.},
}
@inProceedings{Aizawa01,
   author       = {Akiko Aizawa},
   title        = {Linguistic Techniques to Improve the Performance of Automatic 
                   Text Categorization},
   booktitle    = {Proceedings of NLPRS-01, 6th Natural Language Processing Pacific 
                   Rim Symposium},
   editor       = {},
   publisher    = {},
   address      = {Tokyo, {JP}},
   year         = {2001},
   pages        = {307--314},
   url          = {http://www.afnlp.org/nlprs2001/pdf/0079-01.pdf},
   abstract     = {This paper presents a method for incorporating natural language 
                   processing into existing text categorization procedures. Three 
                   aspects are considered in the investigation: (i) a method for 
                   weighting terms based on the concept of a probability weighted 
                   amount of information, (ii) estimation of term occurrence 
                   probabilities using a probabilistic language model, and (iii) 
                   automatic extraction of terms based on POS tags automatically 
                   generated by a morphological analyzer. The effects of these 
                   considerations are examined in the experiments using 
                   Reuters-21578 and NTCIR-J1 standard test collections.},
}
@inProceedings{Alias02,
   author       = {Francesc Al{\'i}as and Ignasi Iriondo and Pere Barnola},
   title        = {Multi-domain text classification for unit selection 
                   text-to-speech synthesis},
   booktitle    = {Proceedings of ICPhS-03, 15th International Congress of Phonetic 
                   Sciences},
   address      = {Barcelona, ES},
   editor       = {},
   publisher    = {},
   year         = {2003},
   pages        = {},
   url          = {},
   abstract     = {},
}
@inProceedings{AlKofahi01,
   author       = {Khalid Al-Kofahi and Alex Tyrrell and Arun Vachher and Tim 
                   Travers and Peter Jackson},
   title        = {Combining Multiple Classifiers for Text Categorization},
   booktitle    = {Proceedings of CIKM-01, 10th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {Henrique Paques and Ling Liu and David Grossman},
   year         = {2001},
   address      = {Atlanta, {US}},
   pages        = {97--104},
   url          = {http://doi.acm.org/10.1145/502585.502603},
   abstract     = {A major problem facing online information services is how to 
                   index and supplement large document collections with respect to a 
                   rich set of categories. We focus upon the routing of case law 
                   summaries to various secondary law volumes in which they should 
                   be cited. Given the large number (> 13,000) of closely related 
                   categories, this is a challenging task that is unlikely to 
                   succumb to a single algorithmic solution. Our fully implemented 
                   and recently deployed system shows that a superior classification 
                   engine for this task can be constructed from a combination of 
                   classifiers. The multi-classifier approach helps us leverage all 
                   the relevant textual features and meta data, and appears to 
                   generalize to related classification tasks.},
}
@inProceedings{Amati96,
   author       = {Gianni Amati and Daniela D'Aloisi and Vittorio Giannini and 
                   Flavio Ubaldini},
   title        = {An Integrated System for Filtering News and Managing Distributed 
                   Data},
   booktitle    = {Proceedings of PAKM-96, 1st International Conference on Practical 
                   Aspects of Knowledge Management},
   editor       = {},
   publisher    = {},
   year         = {1996},
   pages        = {},
   note         = {An extended version appears as~\cite{Amati97b}},
   address      = {Basel, {CH}},
   url          = {http://airone.fub.it:8080/projects/pakm96.ps},
   abstract     = {With the development and diffusion of the Internet worldwide 
                   connection, a large amount of information can be delivered to the 
                   users. To avoid their being overflowed by the incoming data, 
                   methods of information filtering are required. Thus, there is the 
                   problem of determining what information is relevant to the user 
                   and how this decision can be taken by a supporting system. 
                   Parametric and qualitative descriptors of user's interest must be 
                   generated. This paper presents two approaches. The first concerns 
                   an information filtering system based on an adaptation of the 
                   generalized probabilistic model of information retrieval. The 
                   user profile is a vector of weighted terms which are learned from 
                   the relevance assessment values given by the user on the training 
                   set. Positive terms are considered relevant to the informative 
                   need of the user, negative ones irrelevant. The relevance values 
                   are interpreted as subjective probabilities and hence are mapped 
                   into the real interval [0; 1]. ProFile is a filtering system for 
                   the netnews which uses this model with a scale of 11 predefined 
                   values of relevance. ProFile allows the user to update on-line 
                   his profile and to check the discrepancy between his assessment 
                   and the prediction of relevance of the system. The second 
                   concerns the InfoAgent, a system for supporting users in 
                   retrieving data in distributed and heterogeneous archives and 
                   repositories. The architecture is based on the metaphor of the 
                   software agents and incorporates innovative hints from other 
                   fields: distributed architectures, relevance feedback and active 
                   interfaces. The system has a cooperative and supportive role: it 
                   understands the user's needs and learns from his behavior. Its 
                   aim is to disengage the user from learning complex tools and from 
                   performing tedious and repetitive actions.},
}
@inProceedings{Amati97,
   author       = {Gianni Amati and Fabio Crestani and Flavio Ubaldini},
   title        = {A learning system for selective dissemination of information},
   booktitle    = {Proceedings of IJCAI-97, 15th International Joint Conference on 
                   Artificial Intelligence},
   editor       = {Martha E. Pollack},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   year         = {1997},
   pages        = {764--769},
   address      = {Nagoya, {JP}},
   url          = {http://www.cs.strath.ac.uk/~fabioc/papers/97-ijcai.pdf},
   abstract     = {New methods and new systems are needed to filter or to 
                   selectively distribute the increasing volume of electronic 
                   information being produced nowadays. An effective information 
                   filtering system is one that provides the exact information that 
                   fulfills a user's interest with the minimum effort by the user to 
                   describe it. Such a system will have to be adaptive to the user 
                   changing interest. In this paper we present a learning system for 
                   information filtering and selective information dissemination. 
                   The learning algorithm is described and the effectiveness of the 
                   system is evaluated in a true information filtering style.},
}
@inProceedings{Amati97a,
   author       = {Gianni Amati and Fabio Crestani and Flavio Ubaldini and Stefano 
                   De Nardis},
   title        = {Probabilistic Learning for Information Filtering},
   booktitle    = {Proceedings of RIAO-97, 5th International Conference ``Recherche 
                   d'Information Assistee par Ordinateur''},
   editor       = {Luc Devroye and Claude Chrisment},
   address      = {Montreal, {CA}},
   year         = {1997},
   pages        = {513--530},
   note         = {An extended version appears as~\cite{Amati99}},
   url          = {http://www.cs.strath.ac.uk/~fabioc/papers/97-riao.pdf},
   abstract     = {In this paper we describe and evaluate a learning model for 
                   information filtering which is an adaptation of the generalised 
                   probabilistic model of Information Retrieval. The model is based 
                   on the concept of ``uncertainty sampling'', a technique that 
                   allows for relevance feedback both on relevant and non relevant 
                   documents. The proposed learning model is the core of a prototype 
                   information filtering system called ProFile.},
}
@article{Amati97b,
   author       = {Gianni Amati and Daniela D'Aloisi and Vittorio Giannini and 
                   Flavio Ubaldini},
   title        = {A Framework for Filtering News and Managing Distributed Data},
   journal      = {Journal of Universal Computer Science},
   year         = {1997},
   number       = {8},
   volume       = {3},
   pages        = {1007--1021},
   url          = {http://www.jucs.org/jucs_3_8/a_framework_for_filtering},
   abstract     = {With the development and diffusion of the Internet worldwide 
                   connection, a large amount of information is available to the 
                   users. Methods of information filtering and fetching are then 
                   required. This paper presents two approaches. The first concerns 
                   the information filtering system ProFile based on an adaptation 
                   of the generalized probabilistic model of information retrieval. 
                   ProFile filters the netnews and uses a scale of 11 predefined 
                   values of relevance. ProFile allows the user to update on-line 
                   the profile and to check the discrepancy between the assessment 
                   and the prediction of relevance of the system. The second 
                   concerns ABIS, an intelligent agent for supporting users in 
                   filtering data from distributed and heterogeneous archives and 
                   repositories. ABIS minimizes user's effort in selecting the huge 
                   amount of available documents. The filtering engine memorizes 
                   both user preferences and past situations. ABIS compares 
                   documents with the past situations and finds the similarity 
                   scores on the basis of a memory-based reasoning approach.},
}
@article{Amati99,
   author       = {Gianni Amati and Fabio Crestani},
   title        = {Probabilistic learning for selective dissemination of information},
   journal      = {Information Processing and Management},
   pages        = {633--654},
   year         = {1999},
   number       = {5},
   volume       = {35},
   url          = {http://www.cs.strath.ac.uk/~fabioc/papers/99-ipem.pdf},
   abstract     = {New methods and new systems are needed to filter or to 
                   selectively distribute the increasing volume of electronic 
                   information being produced nowadays. An effective information 
                   filtering system is one that provides the exact information that 
                   fulfills user's interests with the minimum effort by the user to 
                   describe it. Such a system will have to be adaptive to the user 
                   changing interest. In this paper we describe and evaluate a 
                   learning model for information filtering which is an adaptation 
                   of the generalized probabilistic model of Information Retrieval. 
                   The model is based on the concept of `uncertainty sampling', a 
                   technique that allows for relevance feedback both on relevant and 
                   nonrelevant documents. The proposed learning model is the core of 
                   a prototype information filtering system called ProFile.},
}
@inProceedings{Androutsopoulos00,
   author       = {Ion Androutsopoulos and John Koutsias and Konstandinos V. 
                   Chandrinos and Constantine D. Spyropoulos},
   title        = {An experimental comparison of naive {B}ayesian and keyword-based 
                   anti-spam filtering with personal e-mail messages},
   booktitle    = {Proceedings of SIGIR-00, 23rd ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Athens, {GR}},
   year         = {2000},
   pages        = {160--167},
   url          = {http://doi.acm.org/10.1145/345508.345569},
   abstract     = {The growing problem of unsolicited bulk e-mail, also known as 
                   ``spam'', has generated a need for reliable anti-spam e-mail 
                   filters. Filters of this type have so far been based mostly on 
                   manually constructed keyword patterns. An alternative approach 
                   has recently been proposed, whereby a Naive Bayesian classifier 
                   is trained automatically to detect spam messages. We test this 
                   approach on a large collection of personal e-mail messages, which 
                    we make publicly available in ``encrypted'' form contributing 
                   towards standard benchmarks. We introduce appropriate 
                   cost-sensitive measures, investigating at the same time the 
                   effect of attribute-set size, training-corpus size, 
                   lemmatization, and stop lists, issues that have not been explored 
                   in previous experiments. Finally, the Naive Bayesian filter is 
                   compared, in terms of performance, to a filter that uses keyword 
                   patterns, and which is part of a widely used e-mail reader.},
}
@article{Appiani01,
   author       = {Enrico Appiani and Francesca Cesarini and Annamaria Colla and 
                   Massimiliano Diligenti and Marco Gori and Simone Marinai and 
                   Giovanni Soda},
   title        = {Automatic document classification and indexing in high-volume 
                   applications},
   journal      = {International Journal on Document Analysis and Recognition},
   year         = {2001},
   number       = {2},
   volume       = {4},
   pages        = {69--83},
   url          = {http://link.springer-ny.com/link/service/journals/10032/papers/1004002/10040069.pdf},
   abstract     = {In this paper a system for analysis and automatic indexing of 
                   imaged documents for high-volume applications is described. This 
                   system, named STRETCH (STorage and RETrieval by Content of imaged 
                   documents), is based on an Archiving and Retrieval Engine, which 
                   overcomes the bottleneck of document profiling bypassing some 
                   limitations of existing pre-defined indexing schemes. The engine 
                   exploits a structured document representation and can activate 
                   appropriate methods to characterise and automatically index 
                   heterogeneous documents with variable layout. The originality of 
                   STRETCH lies principally in the possibility for unskilled users 
                   to define the indexes relevant to the document domains of their 
                   interest by simply presenting visual examples and applying 
                   reliable automatic information extraction methods (document 
                   classification, flexible reading strategies) to index the 
                   documents automatically, thus creating archives as desired. 
                   STRETCH offers ease of use and application programming and the 
                   ability to dynamically adapt to new types of documents. The 
                   system has been tested in two applications in particular, one 
                   concerning passive invoices and the other bank documents. In 
                   these applications, several classes of documents are involved. 
                   The indexing strategy first automatically classifies the 
                   document, thus avoiding pre-sorting, then locates and reads the 
                   information pertaining to the specific document class. 
                   Experimental results are encouraging overall; in particular, 
                   document classification results fulfill the requirements of 
                   high-volume application. Integration into production lines is 
                   under execution.},
}
@article{Apte94,
   author       = {Apt\'{e}, Chidanand and Damerau, Fred J. and Weiss, Sholom M.},
   title        = {Automated learning of decision rules for text categorization},
   journal      = {{ACM} Transactions on Information Systems},
   year         = {1994},
   number       = {3},
   volume       = {12},
   pages        = {233--251},
   url          = {http://www.acm.org/pubs/articles/journals/tois/1994-12-3/p233-apte/p233-apte.pdf},
   abstract     = {We describe the results of extensive experiments using optimized 
                   rule-based induction methods on large document collections. The 
                   goal of these methods is to discover automatically classification 
                   patterns that can be used for general document categorization or 
                   personalized filtering of free text. Previous reports indicate 
                   that human-engineered rule-based systems, requiring many 
                    man-years of developmental efforts, have been successfully built 
                   to ``read'' documents and assign topics to them. We show that 
                   machine-generated decision rules appear comparable to human 
                   performance, while using the identical rule-based representation. 
                   In comparison with other machine-learning techniques, results on 
                   a key benchmark from the Reuters collection show a large gain in 
                   performance, from a previously reported 67\% recall/precision 
                   breakeven point to 80.5\%. In the context of a very 
                   high-dimensional feature space, several methodological 
                   alternatives are examined, including universal versus local 
                   dictionaries, and binary versus frequency related features.},
}
@inProceedings{Apte94a,
   author       = {Apt\'{e}, Chidanand and Damerau, Fred J. and Weiss, Sholom M.},
   title        = {Towards Language-Independent Automated Learning of Text 
                   Categorization Models},
   booktitle    = {Proceedings of SIGIR-94, 17th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {W. Bruce Croft and Cornelis J. Van Rijsbergen},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Dublin, {IE}},
   pages        = {23--30},
   year         = {1994},
   note         = {An extended version appears as~\cite{Apte94}},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/188490/p23-apte/p23-apte.pdf},
   abstract     = {We describe the results of extensive machine learning experiments 
                   on large collections of Reuters' English and German newswires. 
                   The goal of these experiments was to automatically discover 
                   classification patterns that can be used for assignment of topics 
                   to the individual newswires. Our results with the English 
                   newswire collection show a very large gain in performance as 
                   compared to published benchmarks, while our initial results with 
                   the German newswires appear very promising. We present our 
                   methodology, which seems to be insensitive to the language of the 
                   document collections, and discuss issues related to the 
                   differences in results that we have obtained for the two 
                   collections.},
}
@article{Attardi98,
   author       = {Attardi, Giuseppe and Di Marco, Sergio and Salvi, Davide},
   title        = {Categorization by context},
   journal      = {Journal of Universal Computer Science},
   year         = {1998},
   number       = {9},
   volume       = {4},
   pages        = {719--736},
   url          = {http://www.jucs.org/jucs_4_9/categorisation_by_context},
   abstract     = {Assistance in retrieving documents on the World Wide Web is 
                   provided either by search engines, through keyword based queries, 
                   or by catalogues, which organise documents into hierarchical 
                   collections. Maintaining catalogues manually is becoming 
                   increasingly difficult due to the sheer amount of material on the 
                   Web, and therefore it will be soon necessary to resort to 
                   techniques for automatic classification of documents. 
                   Classification is traditionally performed by extracting 
                   information for indexing a document from the document itself. The 
                   paper describes the technique of categorisation by context, which 
                   exploits the context perceivable from the structure of HTML 
                   documents to extract useful information for classifying the 
                   documents they refer to. We present the results of experiments 
                   with a preliminary implementation of the technique.},
}
@inProceedings{Attardi99,
   author       = {Giuseppe Attardi and Antonio Gull{\'{\i}} and Fabrizio Sebastiani},
   title        = {Automatic {W}eb Page Categorization by Link and Context Analysis},
   booktitle    = {Proceedings of THAI-99, 1st European Symposium on Telematics, 
                   Hypermedia and Artificial Intelligence},
   editor       = {Chris Hutchison and Gaetano Lanzarone},
   year         = {1999},
   address      = {Varese, {IT}},
   pages        = {105--119},
   url          = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/THAI99.pdf},
   abstract     = {Assistance in retrieving documents on the World Wide Web is 
                   provided either by search engines, through keyword-based queries, 
                   or by catalogues, which organize documents into hierarchical 
                   collections. Maintaining catalogues manually is becoming 
                   increasingly difficult, due to the sheer amount of material on 
                   the Web; it is thus becoming necessary to resort to techniques 
                   for the automatic classification of documents. Automatic 
                   classification is traditionally performed by extracting the 
                   information for representing a document (``indexing'') from the 
                   document itself. The paper describes the novel technique of 
                   categorization by context, which instead extracts useful 
                   information for classifying a document from the context where a 
                   URL referring to it appears. We present the results of 
                   experimenting with Theseus, a classifier that exploits this 
                   technique.},
}
@inProceedings{Avancini03,
   author       = {Henri Avancini and Alberto Lavelli and Bernardo Magnini and 
                   Fabrizio Sebastiani and Roberto Zanoli},
   title        = {Expanding Domain-Specific Lexicons by Term Categorization},
   year         = {2003},
   booktitle    = {Proceedings of SAC-03, 18th ACM Symposium on Applied Computing},
   address      = {Melbourne, {US}},
   publisher    = {{ACM} Press, New York, {US}},
   pages        = {793--797},
   url          = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/SAC03c.pdf},
   abstract     = {We discuss an approach to the automatic expansion of 
                   domain-specific lexicons by means of \emph{term categorization}, 
                   a novel task employing techniques from information retrieval (IR) 
                   and machine learning (ML). Specifically, we view the expansion of 
                   such lexicons as a process of learning previously unknown 
                   associations between terms and \emph{domains}. The process 
                   generates, for each $c_{i}$ in a set $C=\{c_{1},\ldots,c_{m}\}$ 
                   of domains, a lexicon $L^{i}_{1}$, bootstrapping from an initial 
                   lexicon $L^{i}_{0}$ and a set of documents $\theta$ given as 
                   input. The method is inspired by \emph{text categorization} (TC), 
                   the discipline concerned with labelling natural language texts 
                   with labels from a predefined set of domains, or categories. 
                   However, while TC deals with documents represented as vectors in 
                   a space of terms, we formulate the task of term categorization as 
                   one in which terms are (dually) represented as vectors in a space 
                   of documents, and in which terms (instead of documents) are 
                   labelled with domains.},
}
@inProceedings{Baker98,
   author       = {L. Douglas Baker and Andrew K. McCallum},
   title        = {Distributional clustering of words for text classification},
   booktitle    = {Proceedings of SIGIR-98, 21st ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {W. Bruce Croft and Alistair Moffat and Cornelis J. Van Rijsbergen 
                   and Ross Wilkinson and Justin Zobel},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1998},
   address      = {Melbourne, {AU}},
   pages        = {96--103},
   url          = {http://www.cs.cmu.edu/~mccallum/papers/clustering-sigir98.ps.gz},
   abstract     = {We describe the application of distributional clustering to 
                   document classification. This approach clusters words into groups 
                   based on the distribution of class labels associated with each 
                   word. Thus, unlike some other unsupervised 
                   dimensionality-reduction techniques, such as latent semantic 
                   indexing, we are able to compress the feature space much more 
                   aggressively, while still maintaining high document 
                   classification accuracy. Experimental results obtained on three 
                   real-world data sets show that we can reduce the feature 
                   dimensionality by three orders of magnitude and lose only 2\% 
                   accuracy, significantly better than latent semantic indexing, 
                   class-based clustering, feature selection by mutual information, 
                   or Markov-blanket-based feature selection. We also show that less 
                   aggressive clustering sometimes results in improved 
                   classification accuracy over classification without clustering.},
}
@inProceedings{Bao01,
   author       = {Yongguang Bao and Satoshi Aoyama and Xiaoyong Du and Kazutaka 
                   Yamada and Naohiro Ishii},
   title        = {A Rough Set-Based Hybrid Method to Text Categorization},
   booktitle    = {Proceedings of WISE-01, 2nd International Conference on Web 
                   Information Systems Engineering},
   editor       = {M. Tamer {\"{O}}zsu and Hans-J{\"{o}}rg Schek and Katsumi Tanaka 
                   and Yanchun Zhang and Yahiko Kambayashi},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   year         = {2001},
   address      = {Kyoto, {JP}},
   pages        = {254--261},
   url          = {http://dlib.computer.org/conferen/wise/1393/pdf/volume1/13930254.pdf},
   abstract     = {In this paper we present a hybrid text categorization method 
                   based on Rough Sets theory. A central problem in good text 
                    classification for information filtering and retrieval (IF/IR) is 
                   the high dimensionality of the data. It may contain many 
                   unnecessary and irrelevant features. To cope with this problem, 
                   we propose a hybrid technique using Latent Semantic Indexing 
                   (LSI) and Rough Sets theory (RS) to alleviate this situation. 
                   Given corpora of documents and a training set of examples of 
                   classified documents, the technique locates a minimal set of 
                   co-ordinate keywords to distinguish between classes of documents, 
                   reducing the dimensionality of the keyword vectors. This 
                   simplifies the creation of knowledge-based IF/IR systems, speeds 
                   up their operation, and allows easy editing of the rule bases 
                    employed. Besides, we generate several knowledge bases instead of 
                    one knowledge base for the classification of new objects, hoping 
                    that the combination of answers from the multiple knowledge bases 
                    results in better performance. Multiple knowledge bases can be 
                   formulated precisely and in a unified way within the framework of 
                   RS. This paper describes the proposed technique, discusses the 
                   integration of a keyword acquisition algorithm, Latent Semantic 
                   Indexing (LSI) with Rough Set-based rule generate algorithm, and 
                   provides experimental results. The test results show the hybrid 
                   method is better than the previous rough set-based approach.},
}
@inProceedings{Basili00,
   author       = {Roberto Basili and Alessandro Moschitti and Maria T. Pazienza},
   title        = {Language-Sensitive Text Classification},
   booktitle    = {Proceedings of RIAO-00, 6th International Conference ``Recherche 
                   d'Information Assistee par Ordinateur''},
   editor       = {},
   address      = {Paris, {FR}},
   year         = {2000},
   pages        = {331--343},
   url          = {},
   abstract     = {It is a traditional belief that in order to scale-up to more 
                   effective retrieval and access methods modern Information 
                   Retrieval has to consider more the text content. The modalities 
                    and techniques to fit these objectives are still under discussion. 
                   More empirical evidence is required to determine the suitable 
                   linguistic levels for modeling each IR subtask (e.g. information 
                   zoning, parsing, feature selection for indexing,...) and the 
                   corresponding use of this information. In this paper an original 
                   classification model sensitive to document syntactic information 
                   and characterized by a novel inference method is described. 
                   Extensive experimental evidence has been derived on real test 
                   data and also from well-established academic test sets. The 
                   results show that a significant improvement can be derived using 
                   the proposed inference model. Also the role of linguistic 
                   preprocessing seems to provide positive effects on the 
                   performance. POS tagging and recognition of Proper Nouns received 
                   a specific experimental attention and provided significant 
                   effects on measured accuracy.},
}
@inProceedings{Basili01,
   author       = {Roberto Basili and Alessandro Moschitti and Maria T. Pazienza},
   title        = {{NLP}-driven {IR}: Evaluating Performances over a Text 
                   Classification task},
   booktitle    = {Proceedings of IJCAI-01, 17th International Joint Conference on 
                   Artificial Intelligence},
   editor       = {Bernhard Nebel},
   address      = {Seattle, {US}},
   year         = {2001},
   pages        = {1286--1291},
   url          = {},
   abstract     = {Although several attempts have been made to introduce Natural 
                   Language Processing (NLP) techniques in Information Retrieval, 
                   most ones failed to prove their effectiveness in increasing 
                   performances. In this paper Text Classification (TC) has been 
                   taken as the IR task and the effect of linguistic capabilities of 
                   the underlying system have been studied. A novel model for TC, 
                    extending a well-known statistical model (i.e. Rocchio's formula 
                   [Ittner et al., 1995]) and applied to linguistic features has 
                   been defined and experimented. The proposed model represents an 
                   effective feature selection methodology. All the experiments 
                   result in a significant improvement with respect to other purely 
                   statistical methods (e.g. [Yang, 1999]), thus stressing the 
                   relevance of the available linguistic information. Moreover, the 
                    derived classifier reaches the performance (about 85\%) of the 
                    best known models (i.e. Support Vector Machines (SVM) and 
                    k-Nearest Neighbour (KNN)) characterized by a higher 
                   computational complexity for training and processing.},
}
@inProceedings{Basili01a,
   author       = {Roberto Basili and Alessandro Moschitti and Maria T. Pazienza},
   title        = {An hybrid approach to optimize feature selection process in text 
                   classification},
   booktitle    = {Proceedings of AI*IA-01, 7th Congress of the Italian Association 
                   for Artificial Intelligence},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2175},
   editor       = {Floriana Esposito},
   year         = {2001},
   pages        = {320--325},
   address      = {Bari, {IT}},
   url          = {http://link.springer.de/link/service/series/0558/papers/2175/21750320.pdf},
   abstract     = {Feature selection and weighting are the primary activity of every 
                   learning algorithm for text classification. Traditionally these 
                   tasks are carried out individually in two distinct phases: the 
                   first is the global feature selection during a corpus 
                   pre-processing and the second is the application of the feature 
                   weighting model. This means that two (or several) different 
                   techniques are used to optimize the performances even if a single 
                   algorithm may have more chances to operate the right choices. 
                   When the complete feature set is available, the classifier 
                   learning algorithm can better relate to the suitable 
                   representation level the different complex features like 
                   linguistic ones (e.g. syntactic categories associated to words in 
                   the training material or terminological expressions). In [3] it 
                   has been suggested that classifiers based on generalized Rocchio 
                   formula can be used to weight features in category profiles in 
                   order to exploit the selectivity of linguistic information 
                   techniques in text classification. In this paper, a systematic 
                   study aimed to understand the role of Rocchio formula in 
                   selection and weighting of linguistic features will be described.},
}
@inProceedings{Basili01b,
   author       = {Roberto Basili and Alessandro Moschitti},
   title        = {A robust model for intelligent text classification},
   booktitle    = {Proceedings of ICTAI-01, 13th IEEE International Conference on 
                   Tools with Artificial Intelligence},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   editor       = {},
   year         = {2001},
   pages        = {265--272},
   address      = {Dallas, {US}},
   url          = {http://dlib.computer.org/conferen/ictai/1417/pdf/14170265.pdf},
   abstract     = {Methods for taking into account linguistic content into text 
                   retrieval are receiving a growing attention [16],[14]. Text 
                   categorization is an interesting area for evaluating and 
                   quantifying the impact of linguistic information. Works in text 
                   retrieval through Internet suggest that embedding linguistic 
                   information at a suitable level within traditional quantitative 
                   approaches (e.g. sense distinctions for query expansion as in 
                   [14]) is the crucial issue able to bring the experimental stage 
                    to operational results. This kind of representational problem is 
                   also studied in this paper where traditional methods for 
                   statistical text categorization are augmented via a systematic 
                   use of linguistic information. Again, as in [14], the addition of 
                   NLP capabilities also suggested a different application of 
                   existing methods in revised forms. This paper presents an 
                   extension of the Rocchio formula [11] as a feature weighting and 
                   selection model used as a basis for multilingual Information 
                   Extraction. It allows an effective exploitation of the available 
                   linguistic information that better emphasizes this latter with 
                    both significant data compression and accuracy. The result is an 
                   original statistical classifier fed with linguistic (i.e. more 
                   complex) features and characterized by the novel feature 
                   selection and weighting model. It outperforms existing systems by 
                   keeping most of their interesting properties (i.e. easy 
                   implementation, low complexity and high scalability). Extensive 
                   tests of the model suggest its application as a viable and robust 
                   tool for large scale text classification and filtering, as well 
                   as a basic module for more complex scenarios.},
}
@article{Bayer98,
   author       = {Thomas Bayer and Ulrich Kressel and Heike Mogg-Schneider and 
                   Ingrid Renz},
   title        = {Categorizing paper documents. A generic system for domain and 
                   language independent text categorization},
   journal      = {Computer Vision and Image Understanding},
   year         = {1998},
   number       = {3},
   volume       = {70},
   pages        = {299--306},
   url          = {http://www.idealibrary.com/links/doi/10.1006/cviu.1998.0687/pdf},
   abstract     = {Text categorization assigns predefined categories to either 
                   electronically available texts or those resulting from document 
                   image analysis. A generic system for text categorization is 
                   presented which is based on statistical analysis of 
                   representative text corpora. Significant features are 
                   automatically derived from training texts by selecting substrings 
                   from actual word forms and applying statistical information and 
                   general linguistic knowledge. The dimension of the feature 
                   vectors is then reduced by linear transformation, keeping the 
                   essential information. The classification is a minimum 
                   least-squares approach based on polynomials. The described system 
                   can be efficiently adapted to new domains or different languages. 
                   In application, the adapted text categorizers are reliable, fast, 
                   and completely automatic. Two example categorization tasks 
                   achieve recognition scores of approximately 80\% and are very 
                   robust against recognition or typing errors.},
}
@inProceedings{Bekkerman01,
   author       = {Ron Bekkerman and Ran El-Yaniv and Naftali Tishby and Yoad Winter},
   title        = {On Feature Distributional Clustering for Text Categorization},
   booktitle    = {Proceedings of SIGIR-01, 24th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {W. Bruce Croft and David J. Harper and Donald H. Kraft and Justin 
                   Zobel},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {New Orleans, {US}},
   year         = {2001},
   pages        = {146--153},
   url          = {http://www.cs.huji.ac.il/labs/learning/Papers/sigir.ps.gz},
   abstract     = {We describe a text categorization approach that is based on a 
                   combination of feature distributional clusters with a support 
                   vector machine (SVM) classifier. Our feature selection approach 
                   employs distributional clustering of words via the recently 
                   introduced information bottleneck method, which generates a more 
                   efficient word-cluster representation of documents. Combined with 
                   the classification power of an SVM, this method yields high 
                   performance text categorization that can outperform other recent 
                   methods in terms of categorization accuracy and representation 
                   efficiency. Comparing the accuracy of our method with other 
                   techniques, we observe significant dependency of the results on 
                   the data set. We discuss the potential reasons for this 
                   dependency.},
}
@inProceedings{Bel03,
   author       = {Nuria Bel and Cornelis H. Koster and Marta Villegas},
   title        = {Cross-lingual text categorization},
   booktitle    = {Proceedings of ECDL-03, 7th European Conference on Research and 
                   Advanced Technology for Digital Libraries},
   editor       = {Traugott Koch and Ingeborg Torvik S{\o}lvberg},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2769},
   year         = {2003},
   address      = {Trondheim, {NO}},
   pages        = {126--139},
   url          = {},
   abstract     = {},
}
@inProceedings{Benkhalifa99,
   author       = {Benkhalifa, Mohamed and Bensaid, Amine and Mouradi, Abdelhak},
   title        = {Text categorization using the semi-supervised fuzzy c-means 
                   algorithm},
   booktitle    = {Proceedings of NAFIPS-99, 18th International Conference of the 
                   North American Fuzzy Information Processing Society},
   address      = {New York, {US}},
   pages        = {561--565},
   year         = {1999},
   url          = {},
   abstract     = {Text categorization (TC) is the automated assignment of text 
                   documents to predefined categories based on document contents. TC 
                   has become very important in the information retrieval area, 
                   where information needs have tremendously increased with the 
                   rapid growth of textual information sources such as the Internet. 
                   We compare, for text categorization, two partially supervised (or 
                   semi-supervised) clustering algorithms: the Semi-Supervised 
                   Agglomerative Hierarchical Clustering (ssAHC) algorithm (A. Amar 
                   et al., 1997) and the Semi-Supervised Fuzzy-c-Means (ssFCM) 
                   algorithm (M. Amine et al., 1996). This (semi-supervised) 
                   learning paradigm falls somewhere between the fully supervised 
                   and the fully unsupervised learning schemes, in the sense that it 
                   exploits both class information contained in labeled data 
                   (training documents) and structure information possessed by 
                   unlabeled data (test documents) in order to produce better 
                    partitions for test documents. Our experiments make use of the 
                   Reuters 21578 database of documents and consist of a binary 
                   classification for each of the ten most populous categories of 
                   the Reuters database. To convert the documents into vector form, 
                   we experiment with different numbers of features, which we 
                   select, based on an information gain criterion. We verify 
                   experimentally that ssFCM both outperforms and takes less time 
                   than the Fuzzy-c-Means (FCM) algorithm. With a smaller number of 
                    features, ssFCM's performance is also superior to that of 
                    ssAHC. Finally, ssFCM results in improved performance and faster 
                   execution time as more weight is given to training documents.},
}
@article{Benkhalifa01,
   author       = {Mohammed Benkhalifa and Abdelhak Mouradi and Houssaine Bouyakhf},
   title        = {Integrating External Knowledge to Supplement Training Data in 
                   Semi-Supervised Learning for Text Categorization},
   journal      = {Information Retrieval},
   number       = {2},
   volume       = {4},
   pages        = {91--113},
   year         = {2001},
   url          = {http://www.wkap.nl/article.pdf?351286},
   abstract     = {Text Categorization (TC) is the automated assignment of text 
                   documents to predefined categories based on document contents. TC 
                   has been an application for many learning approaches, which prove 
                   effective. Nevertheless, TC provides many challenges to machine 
                   learning. In this paper, we suggest, for text categorization, the 
                   integration of external WordNet lexical information to supplement 
                   training data for a semi-supervised clustering algorithm which 
                   can learn from both training and test documents to classify new 
                   unseen documents. This algorithm is the ``Semi-Supervised Fuzzy 
                   c-Means'' (ssFCM). Our experiments use Reuters 21578 database and 
                   consist of binary classifications for categories selected from 
                   the 115 TOPICS classes of the Reuters collection. Using the 
                   Vector Space Model, each document is represented by its original 
                   feature vector augmented with external feature vector generated 
                   using WordNet. We verify experimentally that the integration of 
                   WordNet helps ssFCM improve its performance, effectively 
                   addresses the classification of documents into categories with 
                   few training documents and does not interfere with the use of 
                   training data.},
}
@article{Benkhalifa01a,
   author       = {Mohammed Benkhalifa and Abdelhak Mouradi and Houssaine Bouyakhf},
   title        = {Integrating {WordNet} knowledge to supplement training data in 
                   semi-supervised agglomerative hierarchical clustering for text 
                   categorization},
   journal      = {International Journal of Intelligent Systems},
   pages        = {929--947},
   year         = {2001},
   volume       = {16},
   number       = {8},
   url          = {http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=84503376&PLACEBO=IE.pdf},
   abstract     = {Text categorization (TC) is the automated assignment of text 
                   documents to predefined categories based on document contents. TC 
                    has been an application for many learning approaches, which 
                    proved effective. Nevertheless, TC provides many challenges to 
                    machine learning. In this paper, we suggest, for text 
                   categorization, the integration of external WordNet lexical 
                   information to supplement training data for a semi-supervised 
                   clustering algorithm which (i) uses a finite design set of 
                   labeled data to (ii) help agglomerative hierarchical clustering 
                   algorithms (AHC) partition a finite set of unlabeled data and 
                   then (iii) terminates without the capacity to classify other 
                    objects. This algorithm is the ``semi-supervised agglomerative 
                    hierarchical clustering algorithm'' (ssAHC). Our experiments use 
                   Reuters 21578 database and consist of binary classifications for 
                   categories selected from the 89 TOPICS classes of the Reuters 
                   collection. Using the vector space model (VSM), each document is 
                   represented by its original feature vector augmented with 
                   external feature vector generated using WordNet. We verify 
                   experimentally that the integration of WordNet helps ssAHC 
                   improve its performance, effectively addresses the classification 
                    of documents into categories with few training documents, and 
                   does not interfere with the use of training data.},
}
@inProceedings{Bennett02,
   author       = {Paul N. Bennett and Susan T. Dumais and Eric Horvitz},
   title        = {Probabilistic combination of text classifiers using reliability 
                   indicators: models and results},
   booktitle    = {Proceedings of SIGIR-02, 25th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng 
                   and Kalervo J{\"{a}}rvelin},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Tampere, {FI}},
   year         = {2002},
   pages        = {207--214},
   url          = {http://doi.acm.org/10.1145/564376.564413},
   abstract     = {The intuition that different text classifiers behave in 
                   qualitatively different ways has long motivated attempts to build 
                   a better metaclassifier via some combination of classifiers. We 
                   introduce a probabilistic method for combining classifiers that 
                   considers the context-sensitive reliabilities of contributing 
                   classifiers. The method harnesses reliability 
                   indicators---variables that provide a valuable signal about the 
                   performance of classifiers in different situations. We provide 
                   background, present procedures for building metaclassifiers that 
                   take into consideration both reliability indicators and 
                   classifier outputs, and review a set of comparative studies 
                   undertaken to evaluate the methodology.},
}
@inProceedings{Bennett03,
   author       = {Paul N. Bennett},
   title        = {Using asymmetric distributions to improve text classifier 
                   probability estimates},
   booktitle    = {Proceedings of SIGIR-03, 26th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Jamie Callan and Gordon Cormack and Charles Clarke and David 
                   Hawking and Alan Smeaton},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Toronto, {CA}},
   year         = {2003},
   pages        = {111--118},
   url          = {http://doi.acm.org/10.1145/860435.860457},
   abstract     = {Text classifiers that give probability estimates are more readily 
                   applicable in a variety of scenarios. For example, rather than 
                   choosing one set decision threshold, they can be used in a 
                   Bayesian risk model to issue a run-time decision which minimizes 
                   a user-specified cost function dynamically chosen at prediction 
                   time. However, the quality of the probability estimates is 
                   crucial. We review a variety of standard approaches to converting 
                   scores (and poor probability estimates) from text classifiers to 
                   high quality estimates and introduce new models motivated by the 
                    intuition that the empirical score distributions for the 
                    ``extremely irrelevant'', ``hard to discriminate'', and ``obviously 
                    relevant'' items are often significantly different. Finally, we 
                   analyze the experimental performance of these models over the 
                   outputs of two text classifiers. The analysis demonstrates that 
                   one of these models is theoretically attractive (introducing few 
                   new parameters while increasing flexibility), computationally 
                   efficient, and empirically preferable.},
}
@inProceedings{Biebricher88,
   author       = {Peter Biebricher and Norbert Fuhr and Gerhard Knorz and Gerhard 
                   Lustig and Michael Schwantner},
   title        = {The automatic indexing system {AIR/PHYS}. {F}rom research to 
                   application},
   booktitle    = {Proceedings of SIGIR-88, 11th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Yves Chiaramella},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Grenoble, {FR}},
   pages        = {333--342},
   year         = {1988},
   note         = {Reprinted in Karen Sparck Jones and Peter Willett (eds.), 
                   ``Readings in Information Retrieval'', Morgan Kaufmann, San 
                   Francisco, US, 1997, pp.\ 513--517.},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/62437/p333-biebricher/p333-biebricher.pdf},
   abstract     = {Since October 1985, the automatic indexing system AIR/PHYS has 
                   been used in the input production of the physics data base of the 
                   Fachinformationszentrum Karlsruhe/West Germany. The texts to be 
                   indexed are abstracts written in English. The system of 
                   descriptors is prescribed. For the application of the AIR/PHYS 
                   system a large-scale dictionary containing more than 600000 
                   word-descriptor relations resp. phrase-descriptor relations has 
                   been developed. Most of these relations have been obtained by 
                   means of statistical and heuristical methods. In consequence, the 
                   relation system is rather imperfect. Therefore, the indexing 
                   system needs some fault-tolerating features. An appropriate 
                   indexing approach and the corresponding structure of the AIR/PHYS 
                   system are described. Finally, the conditions of the application 
                   as well as problems of further development are discussed.},
}
@inProceedings{Bigi03,
   author       = {Brigitte Bigi},
   title        = {Using {K}ullback-{L}eibler distance for text categorization},
   booktitle    = {Proceedings of ECIR-03, 25th European Conference on Information 
                   Retrieval},
   publisher    = {Springer Verlag},
   editor       = {Fabrizio Sebastiani},
   address      = {Pisa, {IT}},
   year         = {2003},
   pages        = {305--319},
   url          = {http://link.springer.de/link/service/series/0558/papers/2633/26330305.pdf},
   abstract     = {A system that performs text categorization aims to assign 
                   appropriate categories from a predefined classification scheme to 
                   incoming documents. These assignments might be used for varied 
                   purposes such as filtering, or retrieval. This paper introduces a 
                    new and effective model for text categorization with a large corpus 
                    (roughly 1 million documents). Text categorization is 
                   performed using the Kullback-Leibler distance between the 
                   probability distribution of the document to classify and the 
                   probability distribution of each category. Using the same 
                   representation of categories, experiments show a significant 
                    improvement when the above-mentioned method is used. The KLD 
                    method achieves substantial improvements over the tfidf-based 
                    method.},
}
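%
% The Bigi03 abstract above scores a document against each category by the
% Kullback-Leibler distance between smoothed unigram distributions and picks
% the closest category. A minimal Python sketch of that idea follows; the
% smoothing scheme and all function names are illustrative assumptions, not
% the paper's exact formulation.
%
%   import math
%   from collections import Counter
%
%   def unigram_dist(tokens, vocab, eps=1e-6):
%       # smoothed unigram probabilities over a fixed vocabulary
%       counts = Counter(tokens)
%       total = len(tokens) + eps * len(vocab)
%       return {w: (counts[w] + eps) / total for w in vocab}
%
%   def kl_divergence(p, q):
%       # D(p || q) = sum over w of p(w) * log(p(w) / q(w))
%       return sum(p[w] * math.log(p[w] / q[w]) for w in p)
%
%   def classify(doc_tokens, category_tokens, vocab):
%       # category_tokens maps each category name to its training tokens;
%       # the predicted category is the one at minimal KL distance
%       p_doc = unigram_dist(doc_tokens, vocab)
%       return min(category_tokens,
%                  key=lambda c: kl_divergence(p_doc,
%                                              unigram_dist(category_tokens[c], vocab)))
%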
@article{Blei03,
   author       = {David M. Blei and Andrew Y. Ng and Michael I. Jordan},
   title        = {Latent {D}irichlet Allocation},
   journal      = {Journal of Machine Learning Research},
   volume       = {3},
   pages        = {993--1022},
   year         = {2003},
   url          = {http://www.ai.mit.edu/projects/jmlr/papers/volume3/blei03a/blei03a.pdf},
   abstract     = {We describe latent Dirichlet allocation (LDA), a generative 
                   probabilistic model for collections of discrete data such as text 
                   corpora. LDA is a three-level hierarchical Bayesian model, in 
                   which each item of a collection is modeled as a finite mixture 
                   over an underlying set of topics. Each topic is, in turn, modeled 
                   as an infinite mixture over an underlying set of topic 
                   probabilities. In the context of text modeling, the topic 
                   probabilities provide an explicit representation of a document. 
                   We present efficient approximate inference techniques based on 
                   variational methods and an EM algorithm for empirical Bayes 
                   parameter estimation. We report results in document modeling, 
                   text classification, and collaborative filtering, comparing to a 
                   mixture of unigrams model and the probabilistic LSI model.},
}
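%
% The Blei03 abstract above uses per-document topic proportions inferred by LDA
% as a low-dimensional representation for text classification. The sketch below
% shows one way to approximate that pipeline with scikit-learn's variational
% LDA; the toy corpus, labels, and parameter values are placeholders, the
% paper's experiments use the authors' own implementation, and an SVM could
% stand in for the logistic regression used here.
%
%   from sklearn.feature_extraction.text import CountVectorizer
%   from sklearn.decomposition import LatentDirichletAllocation
%   from sklearn.linear_model import LogisticRegression
%
%   docs = ["grain prices rose sharply", "the central bank cut interest rates"]
%   labels = [0, 1]                                       # placeholder categories
%
%   counts = CountVectorizer().fit_transform(docs)        # term-count matrix
%   lda = LatentDirichletAllocation(n_components=2, random_state=0)
%   theta = lda.fit_transform(counts)                     # document-topic proportions
%   clf = LogisticRegression().fit(theta, labels)         # classify in topic space
%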
@article{Bloedorn98,
   author       = {Eric Bloedorn and Ryszard S. Michalski},
   title        = {Data-Driven Constructive Induction},
   journal      = {{IEEE} Intelligent Systems},
   year         = {1998},
   number       = {2},
   volume       = {13},
   pages        = {30--37},
   url          = {http://dlib.computer.org/ex/books/ex1998/pdf/x2030.pdf},
   abstract     = {An inductive learning program's ability to find an accurate 
                   hypothesis can depend on the quality of the representation space. 
                   The authors developed a data-driven constructive-induction method 
                   that uses multiple operators to improve the representation space. 
                   They applied it to two real-world problems.},
}
@inProceedings{Blosseville92,
   author       = {M.J. Blosseville and Georges Hebrail and M.G. Montell and N. 
                   Penot},
   title        = {Automatic document classification: natural language processing and 
                   expert system techniques used together},
   booktitle    = {Proceedings of SIGIR-92, 15th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Nicholas J. Belkin and Peter Ingwersen and Annelise Mark 
                   Pejtersen},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Kobenhavn, {DK}},
   pages        = {51--57},
   year         = {1992},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/133160/p51-blosseville/p51-blosseville.pdf},
   abstract     = {In this paper we describe an automated method of classifying 
                   research project descriptions: a human expert classifies a sample 
                   set of projects into a set of disjoint and pre-defined classes, 
                   and then the computer learns from this sample how to classify new 
                   projects into these classes. Both textual and non-textual 
                   information associated with the projects are used in the learning 
                   and classification phases. Textual information is processed by 
                   two methods of analysis: a natural language analysis followed by 
                   a statistical analysis. Non-textual information is processed by a 
                   symbolic learning technique. We present the results of some 
                   experiments done on real data: two different classifications of 
                   our research projects.},
}
@article{Borko63,
   author       = {Harold Borko and Myrna Bernick},
   title        = {Automatic document classification},
   journal      = {Journal of the Association for Computing Machinery},
   year         = {1963},
   volume       = {10},
   number       = {2},
   pages        = {151--161},
   url          = {http://www.acm.org/pubs/articles/journals/jacm/1963-10-2/p151-borko/p151-borko.pdf},
}
@article{Borko64,
   author       = {Harold Borko and Myrna Bernick},
   title        = {Automatic document classification. Part II: additional 
                   experiments},
   journal      = {Journal of the Association for Computing Machinery},
   year         = {1964},
   volume       = {11},
   number       = {2},
   pages        = {138--151},
   url          = {http://www.acm.org/pubs/articles/journals/jacm/1964-11-2/p138-borko/p138-borko.pdf},
   abstract     = {This study reports the results of a series of experiments in the 
                   techniques of automatic document classifications. Two different 
                   classification schedules are compared along with two methods of 
                   automatically classifying documents into categories. It is 
                   concluded that, while there is no significant difference in the 
                   predictive efficiency between the Bayesian and the Factor Score 
                   methods, automatic document classification is enhanced by the use 
                   of a factor-analytically-derived classification schedule. 
                    Approximately 55 percent of the documents were automatically and 
                   correctly classified.},
}
@inProceedings{Brank02a,
   author       = {Janez Brank and Marko Grobelnik and Natasa Mili{\'{c}}-Frayling 
                   and Dunja Mladeni{\'{c}}},
   title        = {Feature selection using support vector machines},
   booktitle    = {Proceedings of the 3rd International Conference on Data Mining 
                   Methods and Databases for Engineering, Finance, and Other Fields},
   year         = {2002},
   pages        = {},
   address      = {Bologna, {IT}},
   url          = {http://www.brank.org/msr/FsNormal/Bologna/bologna-paper-4.pdf},
   abstract     = {Text categorization is the task of classifying natural language 
                   documents into a set of predefined categories. Documents are 
                   typically represented by sparse vectors under the vector space 
                   model, where each word in the vocabulary is mapped to one 
                   coordinate axis and its occurrence in the document gives rise to 
                   one nonzero component in the vector representing that document. 
                   When training classifiers on large collections of documents, both 
                   the time and memory requirements connected with processing of 
                   these vectors may be prohibitive. This calls for using a feature 
                   selection method, not only to reduce the number of features but 
                   also to increase the sparsity of document vectors. We propose a 
                   feature selection method based on linear Support Vector Machines 
                   (SVMs). First, we train the linear SVM on a subset of training 
                   data and retain only those features that correspond to highly 
                   weighted components (in absolute value sense) of the normal to 
                   the resulting hyperplane that separates positive and negative 
                   examples. This reduced feature space is then used to train a 
                   classifier over a larger training set because more documents now 
                   fit into the same amount of memory. In our experiments we compare 
                    the effectiveness of the SVM-based feature selection with that 
                   of more traditional feature selection methods, such as odds ratio 
                   and information gain, in achieving the desired tradeoff between 
                   the vector sparsity and the classification performance. 
                   Experimental results indicate that, at the same level of vector 
                   sparsity, feature selection based on SVM normals yields better 
                   classification performance than odds ratio- or information 
                    gain-based feature selection when linear SVM classifiers are used.},
}
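%
% The Brank02a abstract above selects features by training a linear SVM on a
% subset of the data and keeping only the features whose weights in the normal
% of the separating hyperplane are largest in absolute value. Below is a sketch
% of that step with scikit-learn; the regularization constant and the number of
% retained features are arbitrary assumptions, not values from the paper.
%
%   import numpy as np
%   from sklearn.svm import LinearSVC
%
%   def svm_select_features(X_subset, y_subset, n_keep=1000):
%       # rank features by |w_j| of the hyperplane normal and keep the top ones
%       svm = LinearSVC(C=1.0).fit(X_subset, y_subset)
%       weights = np.abs(svm.coef_).ravel()
%       return np.argsort(weights)[::-1][:n_keep]
%
%   # keep = svm_select_features(X_small, y_small)
%   # clf = LinearSVC().fit(X_full[:, keep], y_full)   # retrain on the reduced space
%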
@inProceedings{Bruckner97,
   author       = {T. Bruckner},
   title        = {The text categorization system {TEKLIS} at {TREC-6}},
   booktitle    = {Proceedings of TREC-6, 6th Text Retrieval Conference},
   publisher    = {National Institute of Standards and Technology, Gaithersburg, {US}},
   editor       = {Ellen M. Voorhees and Donna K. Harman},
   year         = {1997},
   address      = {Gaithersburg, {US}},
   pages        = {619--621},
   url          = {http://trec.nist.gov/pubs/trec6/papers/siemens.ps.gz},
   abstract     = {The article documents the author's participation in the filtering 
                   and routing tasks of TREC-6 with the commercial filtering system 
                   TEKLIS. TEKLIS is a training based statistical categorization 
                   system which incorporates shallow linguistic processing and fuzzy 
                   set methods. The author presents the core technology of TEKLIS, 
                   the results on the filtering and routing tasks and a discussion 
                   of the insights gained through participation in the exercise.},
}
@inProceedings{Cai03,
   author       = {Lijuan Cai and Thomas Hofmann},
   title        = {Text categorization by boosting automatically extracted concepts},
   booktitle    = {Proceedings of SIGIR-03, 26th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Jamie Callan and Gordon Cormack and Charles Clarke and David 
                   Hawking and Alan Smeaton},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Toronto, {CA}},
   year         = {2003},
   pages        = {182--189},
   url          = {http://doi.acm.org/10.1145/860435.860469},
   abstract     = {A novel maximal figure-of-merit (MFoM) learning approach to text 
                   categorization is proposed. Different from the conventional 
                   techniques, the proposed MFoM method attempts to integrate any 
                   performance metric of interest (e.g. accuracy, recall, precision, 
                   or F1 measure) into the design of any classifier. The 
                   corresponding classifier parameters are learned by optimizing an 
                   overall objective function of interest. To solve this highly 
                   nonlinear optimization problem, we use a generalized 
                   probabilistic descent algorithm. The MFoM learning framework is 
                   evaluated on the Reuters-21578 task with LSI-based feature 
                   extraction and a binary tree classifier. Experimental results 
                   indicate that the MFoM classifier gives improved F1 and enhanced 
                   robustness over the conventional one. It also outperforms the 
                   popular SVM method in micro-averaging F1. Other extensions to 
                   design discriminative multiple-category MFoM classifiers for 
                   application scenarios with new performance metrics could be 
                   envisioned too.},
}
@article{Carbonell00,
   author       = {Jaime Carbonell and William W. Cohen and Yiming Yang},
   title        = {Guest editors' introduction to the special issue on machine 
                   learning and information retrieval},
   journal      = {Machine Learning},
   volume       = {39},
   number       = {2/3},
   pages        = {99--101},
   year         = {2000},
   url          = {http://www.wkap.nl/article.pdf?255754},
}
@inCollection{Caropreso01,
   author       = {Maria Fernanda Caropreso and Stan Matwin and Fabrizio Sebastiani},
   title        = {A learner-independent evaluation of the usefulness of statistical 
                   phrases for automated text categorization},
   year         = {2001},
   booktitle    = {Text Databases and Document Management: Theory and Practice},
   editor       = {Amita G. Chin},
   publisher    = {Idea Group Publishing},
   address      = {Hershey, {US}},
   pages        = {78--102},
   url          = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/TD01a.pdf},
   abstract     = {In this work we investigate the usefulness of {\em $n$-grams} for 
                   document indexing in text categorization (TC). We call $n$-gram a 
                   set $g_k$ of $n$ word stems, and we say that $g_k$ occurs in a 
                   document $d_j$ when a sequence of words appears in $d_j$ that, 
                   after stop word removal and stemming, consists exactly of the $n$ 
                    stems in $g_k$, in some order. Previous research has 
                   investigated the use of $n$-grams (or some variant of them) in 
                   the context of specific learning algorithms, and thus have not 
                   obtained general answers on their usefulness for TC. In this work 
                   we investigate the usefulness of $n$-grams in TC independently of 
                   any specific learning algorithm. We do so by applying feature 
                   selection to the pool of all $k$-grams ($k\leq n$), and checking 
                   how many $n$-grams score high enough to be selected in the top 
                   $\sigma$ $k$-grams. We report the results of our experiments, 
                   using various feature selection measures and varying values of 
                   $\sigma$, performed on the {\sc Reuters-21578} standard TC 
                   benchmark. We also report results of making actual use of the 
                   selected $n$-grams in the context of a linear classifier induced 
                   by means of the Rocchio method.},
}
@inProceedings{Carreras01,
   author       = {Xavier Carreras and Llu\'{\i}s M\'arquez},
   title        = {Boosting Trees for Anti-Spam Email Filtering},
   year         = {2001},
   editor       = {},
   booktitle    = {Proceedings of RANLP-01, 4th International Conference on Recent 
                   Advances in Natural Language Processing},
   address      = {Tzigov Chark, {BG}},
   pages        = {},
   url          = {http://www.lsi.upc.es/~carreras/pub/boospam.ps},
}
@inProceedings{Cavnar94,
   author       = {William B. Cavnar and John M. Trenkle},
   title        = {N-Gram-Based Text Categorization},
   booktitle    = {Proceedings of SDAIR-94, 3rd Annual Symposium on Document 
                   Analysis and Information Retrieval},
   publisher    = {},
   editor       = {},
   year         = {1994},
   address      = {Las Vegas, {US}},
   pages        = {161--175},
   url          = {http://www.nonlineardynamics.com/trenkle/papers/sdair-94-bc.ps.gz},
   abstract     = {Text categorization is a fundamental task in document 
                   processing, allowing the automated handling of enormous streams 
                   of documents in electronic form. One difficulty in handling some 
                   classes of documents is the presence of different kinds of 
                   textual errors, such as spelling and grammatical errors in email, 
                   and character recognition errors in documents that come through 
                   OCR. Text categorization must work reliably on all input, and 
                   thus must tolerate some level of these kinds of problems. We 
                   describe here an N-gram-based approach to text categorization 
                   that is tolerant of textual errors. The system is small, fast and 
                   robust. This system worked very well for language classification, 
                   achieving in one test a 99.8\% correct classification rate on 
                   Usenet newsgroup articles written in different languages. The 
                   system also worked reasonably well for classifying articles from 
                   a number of different computer-oriented newsgroups according to 
                   subject, achieving as high as an 80\% correct classification 
                   rate. There are also several obvious directions for improving the 
                    system's classification performance in those cases where it did 
                   not do as well. The system is based on calculating and comparing 
                   profiles of N-gram frequencies. First, we use the system to 
                   compute profiles on training set data that represent the various 
                   categories, e.g., language samples or newsgroup content samples. 
                   Then the system computes a profile for a particular document that 
                   is to be classified. Finally, the system computes a distance 
                    measure between the document's profile and each of the category 
                    profiles. The system selects the category whose profile has the 
                    smallest distance to the document's profile. The profiles 
                   involved are quite small, typically 10K bytes for a category 
                   training set, and less than 4K bytes for an individual document. 
                   Using N-gram frequency profiles provides a simple and reliable 
                   way to categorize documents in a wide range of classification 
                   tasks.},
}
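%
% The Cavnar94 abstract above classifies a document by comparing its ranked
% character n-gram frequency profile with precomputed category profiles and
% choosing the category at the smallest distance. The sketch below uses the
% "out-of-place" rank distance usually associated with this paper; the profile
% length and the range of n-gram sizes are assumptions.
%
%   from collections import Counter
%
%   def ngram_profile(text, n_values=(1, 2, 3), top=300):
%       # ranked list of the most frequent character n-grams
%       counts = Counter(text[i:i + n]
%                        for n in n_values
%                        for i in range(len(text) - n + 1))
%       return [g for g, _ in counts.most_common(top)]
%
%   def out_of_place(doc_profile, cat_profile):
%       # sum of rank differences; n-grams missing from the category profile
%       # receive the maximum penalty
%       ranks = {g: r for r, g in enumerate(cat_profile)}
%       return sum(abs(r - ranks.get(g, len(cat_profile)))
%                  for r, g in enumerate(doc_profile))
%
%   def classify(text, category_profiles):
%       profile = ngram_profile(text)
%       return min(category_profiles,
%                  key=lambda c: out_of_place(profile, category_profiles[c]))
%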
@inProceedings{Ceci03,
   author       = {Michelangelo Ceci and Donato Malerba},
   title        = {Hierarchical Classification of {HTML} Documents with {WebClassII}},
   booktitle    = {Proceedings of ECIR-03, 25th European Conference on Information 
                   Retrieval},
   publisher    = {Springer Verlag},
   editor       = {Fabrizio Sebastiani},
   address      = {Pisa, {IT}},
   year         = {2003},
   pages        = {57--72},
   url          = {http://link.springer.de/link/service/series/0558/papers/2633/26330057.pdf},
   abstract     = {This paper describes a new method for the classification of an 
                    HTML document into a hierarchy of categories. The hierarchy of 
                   categories is involved in all phases of automated document 
                   classification, namely feature extraction, learning, and 
                   classification of a new document. The innovative aspects of this 
                   work are the feature selection process, the automated threshold 
                   determination for classification scores, and an experimental 
                    study on real-world Web documents that can be associated with any 
                   node in the hierarchy. Moreover, a new measure for the evaluation 
                   of system performances has been introduced in order to compare 
                   three different techniques (flat, hierarchical with proper 
                   training sets, hierarchical with hierarchical training sets). The 
                   method has been implemented in the context of a client-server 
                   application, named WebClassII. Results show that for hierarchical 
                   techniques it is better to use hierarchical training sets.},
}
@inProceedings{Cerny83,
   author       = {Barbara A. Cerny and Anna Okseniuk and J. Dennis Lawrence},
   title        = {A fuzzy measure of agreement between machine and manual 
                   assignment of documents to subject categories},
   booktitle    = {Proceedings of ASIS-83, 46th Annual Meeting of the American 
                   Society for Information Science},
   publisher    = {American Society for Information Science, Washington, {US}},
   editor       = {Raymond F. Vondran and Anne Caputo and Carol Wasserman and 
                   Richard A. Diener},
   year         = {1983},
   address      = {Washington, {US}},
   pages        = {265},
   url          = {},
}
@inProceedings{Chakrabarti97,
   author       = {Soumen Chakrabarti and Byron E. Dom and Rakesh Agrawal and 
                   Prabhakar Raghavan},
   title        = {Using taxonomy, discriminants, and signatures for navigating in 
                   text databases},
   booktitle    = {Proceedings of VLDB-97, 23rd International Conference on Very 
                   Large Data Bases},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   editor       = {Matthias Jarke and Michael J. Carey and Klaus R. Dittrich and 
                   Frederick H. Lochovsky and Pericles Loucopoulos and Manfred A. 
                   Jeusfeld},
   year         = {1997},
   address      = {Athens, {GR}},
   pages        = {446--455},
   url          = {http://www.vldb.org/conf/1997/P446.PDF},
   note         = {An extended version appears as~\cite{Chakrabarti98c}},
   abstract     = {We explore how to organize a text database hierarchically to aid 
                   better searching and browsing. We propose to exploit the natural 
                   hierarchy of topics, or taxonomy, that many corpora, such as 
                   internet directories, digital libraries, and patent databases 
                   enjoy. In our system, the user navigates through the query 
                   response not as a flat unstructured list, but embedded in the 
                   familiar taxonomy, and annotated with document signatures 
                   computed dynamically with respect to where the user is located at 
                   any time. We show how to update such databases with new documents 
                   with high speed and accuracy. We use techniques from statistical 
                   pattern recognition to efficiently separate the feature words or 
                   discriminants from the noise words at each node of the taxonomy. 
                   Using these, we build a multi-level classifier. At each node, 
                   this classifier can ignore the large number of noise words in a 
                   document. Thus the classifier has a small model size and is very 
                   fast. However, owing to the use of context-sensitive features, 
                   the classifier is very accurate. We report on experiences with 
                   the Reuters newswire benchmark, the US Patent database, and web 
                   document samples from {{\sc Yahoo!}}\.},
}
@article{Chakrabarti98c,
   author       = {Soumen Chakrabarti and Byron E. Dom and Rakesh Agrawal and 
                   Prabhakar Raghavan},
   title        = {Scalable feature selection, classification and signature 
                   generation for organizing large text databases into hierarchical 
                   topic taxonomies},
   journal      = {Journal of Very Large Data Bases},
   year         = {1998},
   number       = {3},
   volume       = {7},
   pages        = {163--178},
   url          = {http://www.cs.berkeley.edu/~soumen/VLDB54_3.PDF},
   abstract     = {We explore how to organize large text databases hierarchically by 
                   topic to aid better searching, browsing and filtering. Many 
                   corpora, such as internet directories, digital libraries, and 
                   patent databases are manually organized into topic hierarchies, 
                   also called taxonomies. Similar to indices for relational data, 
                   taxonomies make search and access more efficient. However, the 
                   exponential growth in the volume of on-line textual information 
                   makes it nearly impossible to maintain such taxonomic 
                   organization for large, fast-changing corpora by hand. We 
                   describe an automatic system that starts with a small sample of 
                   the corpus in which topics have been assigned by hand, and then 
                   updates the database with new documents as the corpus grows, 
                   assigning topics to these new documents with high speed and 
                   accuracy. To do this, we use techniques from statistical pattern 
                   recognition to efficiently separate the feature words, or 
                    discriminants, from the noise words at each node of the taxonomy. 
                   Using these, we build a multilevel classifier. At each node, this 
                   classifier can ignore the large number of ``noise'' words in a 
                   document. Thus, the classifier has a small model size and is very 
                   fast. Owing to the use of context-sensitive features, the 
                   classifier is very accurate. As a by-product, we can compute for 
                   each document a set of terms that occur significantly more often 
                   in it than in the classes to which it belongs. We describe the 
                   design and implementation of our system, stressing how to exploit 
                   standard, efficient relational operations like sorts and joins. 
                   We report on experiences with the Reuters newswire benchmark, the 
                   US patent database, and web document samples from Yahoo!. We 
                   discuss applications where our system can improve searching and 
                   filtering capabilities.},
}
@inProceedings{Chakrabarti98b,
   author       = {Soumen Chakrabarti and Byron E. Dom and Piotr Indyk},
   title        = {Enhanced hypertext categorization using hyperlinks},
   booktitle    = {Proceedings of SIGMOD-98, ACM International Conference on 
                   Management of Data},
   editor       = {Laura M. Haas and Ashutosh Tiwary},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Seattle, {US}},
   year         = {1998},
   pages        = {307--318},
   url          = {http://www.acm.org/pubs/articles/proceedings/mod/276304/p307-chakrabarti/p307-chakrabarti.pdf},
   abstract     = {A major challenge in indexing unstructured hypertext databases is 
                   to automatically extract meta-data that enables structured 
                   searching using topic taxonomies, circumvents keyword ambiguity 
                   and improves the quality of searching and profile-based routing 
                   and filtering. Therefore, an accurate classifier is an essential 
                   component of a hypertext database. Hyperlinks pose new problems 
                   not addressed in the extensive text classification literature. 
                   Links clearly contain high-quality semantic clues that are lost 
                   upon a purely term-based classifier, but exploiting link 
                   information is non-trivial because it is noisy. Naive use of 
                   terms in the link neighborhood of a document can even degrade 
                   accuracy. Our contribution is to propose robust statistical 
                   models and a relaxation labeling technique for better 
                   classification by exploiting link information in a small 
                   neighborhood around documents. Our technique also adapts 
                   gracefully to the fraction of neighboring documents having known 
                   topics. We experimented with pre-classified samples from {{\sc 
                   Yahoo!}}\ and the US Patent Database. We have developed a text 
                   classifier that misclassifies only 13\% of the documents in the 
                   Reuters benchmark; this is comparable to the best results ever 
                   obtained. Our new classifier misclassified 36\% of the patents, 
                   indicating that classifying hypertext can be more difficult than 
                   classifying text. Naively using terms in neighboring documents 
                   increased the error to 38\%; our hypertext classifier reduced it 
                   to 21\%. Results with the Yahoo! sample were more dramatic: the 
                   text classifier showed a 68\% error, whereas our hypertext 
                   classifier reduced this to just 21\%.},
}
@article{Chakrabarti99,
   author       = {Soumen Chakrabarti and Byron E. Dom and S. Ravi Kumar and 
                   Prabhakar Raghavan and Sridhar Rajagopalan and Andrew Tomkins and 
                   David Gibson and Jon Kleinberg},
   title        = {Mining the {W}eb's link structure},
   journal      = {Computer},
   year         = {1999},
   number       = {8},
   volume       = {32},
   pages        = {60--67},
   url          = {http://dlib.computer.org/co/books/co1999/pdf/r8060.pdf},
   abstract     = {The Web is a hypertext body of approximately 300 million pages 
                   that continues to grow at roughly a million pages per day. Page 
                   variation is more prodigious than the data's raw scale: Taken as 
                   a whole, the set of Web pages lacks a unifying structure and 
                   shows far more authoring style and content variation than that 
                   seen in traditional text-document collections. This level of 
                   complexity makes an ``off-the-shelf'' database-management and 
                   information-retrieval solution impossible. To date, index-based 
                   search engines for the Web have been the primary tool by which 
                   users search for information. Such engines can build giant 
                   indices that let you quickly retrieve the set of all Web pages 
                   containing a given word or string. Experienced users can make 
                   effective use of such engines for tasks that can be solved by 
                   searching for tightly constrained keywords and phrases. These 
                   search engines are, however, unsuited for a wide range of equally 
                   important tasks. In particular, a topic of any breadth will 
                   typically contain several thousand or million relevant Web pages. 
                   How then, from this sea of pages, should a search engine select 
                    the correct ones---those of most value to the user?},
}
@inProceedings{Chakrabarti02,
   author       = {Soumen Chakrabarti and Shourya Roy and Mahesh Soundalgekar},
   title        = {Fast and accurate text classification via multiple linear 
                   discriminant projections},
   booktitle    = {Proceedings of VLDB-02, 28th International Conference on Very 
                   Large Data Bases},
   publisher    = {},
   editor       = {},
   year         = {2002},
   address      = {Hong Kong, {CN}},
   pages        = {658--669},
   url          = {http://www.vldb.org/conf/2002/S19P01.pdf},
   abstract     = {Support vector machines (SVMs) have shown superb performance for 
                   text classification tasks. They are accurate, robust, and quick 
                   to apply to test instances. Their only potential drawback is 
                   their training time and memory requirement. For n training 
                   instances held in memory, the best-known SVM implementations take 
                    time proportional to $n^a$, where $a$ is typically between 1.8 and 
                   2.1. SVMs have been trained on data sets with several thousand 
                   instances, but Web directories today contain millions of 
                   instances which are valuable for mapping billions of Web pages 
                   into Yahoo!-like directories. We present SIMPL, a nearly 
                   linear-time classification algorithm which mimics the strengths 
                   of SVMs while avoiding the training bottleneck. It uses Fisher's 
                   linear discriminant, a classical tool from statistical pattern 
                   recognition, to project training instances to a carefully 
                   selected low-dimensional subspace before inducing a decision tree 
                   on the projected instances. SIMPL uses efficient sequential scans 
                   and sorts, and is comparable in speed and memory scalability to 
                   widely-used naive Bayes (NB) classifiers, but it beats NB 
                   accuracy decisively. It not only approaches and sometimes exceeds 
                   SVM accuracy, but also beats SVM running time by orders of 
                   magnitude. While developing SIMPL, we also make a detailed 
                   experimental analysis of the cache performance of SVMs.},
}
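%
% The Chakrabarti02 abstract above (SIMPL) projects training documents onto a
% few linear discriminant directions and then induces a decision tree on the
% projected instances. The scikit-learn pipeline below is only a rough analogue
% of that idea (a single Fisher discriminant direction and a shallow tree), not
% the paper's scan-and-sort implementation; X_train, y_train, and X_test are
% placeholders, and the dense conversion would not scale the way SIMPL does.
%
%   from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
%   from sklearn.tree import DecisionTreeClassifier
%   from sklearn.pipeline import make_pipeline
%
%   # project onto a discriminant direction, then learn a tree on the projection
%   simpl_like = make_pipeline(LinearDiscriminantAnalysis(n_components=1),
%                              DecisionTreeClassifier(max_depth=5))
%   # simpl_like.fit(X_train.toarray(), y_train)
%   # predictions = simpl_like.predict(X_test.toarray())
%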
@inProceedings{Chai02,
   author       = {Kian M. Chai and Hwee T. Ng and Hai L. Chieu},
   title        = {Bayesian online classifiers for text classification and filtering},
   booktitle    = {Proceedings of SIGIR-02, 25th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng 
                   and Kalervo J{\"{a}}rvelin},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Tampere, {FI}},
   year         = {2002},
   pages        = {97--104},
   url          = {http://doi.acm.org/10.1145/564376.564395},
   abstract     = {This paper explores the use of Bayesian online classifiers to 
                   classify text documents. Empirical results indicate that these 
                   classifiers are comparable with the best text classification 
                   systems. Furthermore, the online approach offers the advantage of 
                   continuous learning in the batch-adaptive text filtering task.},
}
@inProceedings{Chandrinos00,
   author       = {Konstantinos V. Chandrinos and Ion Androutsopoulos and Georgios 
                   Paliouras and Constantine D. Spyropoulos},
   title        = {Automatic {W}eb Rating: Filtering Obscene Content on the {W}eb},
   booktitle    = {Proceedings of ECDL-00, 4th European Conference on Research and 
                   Advanced Technology for Digital Libraries},
   editor       = {Jos{\'e} L. Borbinha and Thomas Baker},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1923},
   year         = {2000},
   address      = {Lisbon, {PT}},
   pages        = {403--406},
   url          = {http://link.springer.de/link/service/series/0558/papers/1923/19230403.pdf},
   abstract     = {We present a method to detect automatically pornographic content 
                   on the Web. Our method combines techniques from language 
                   engineering and image analysis within a machine-learning 
                   framework. Experimental results show that it achieves nearly 
                   perfect performance on a set of hard cases.},
}
@inProceedings{Chen01,
   author       = {Chien Chin Chen and Chang Chen, Meng and Yeali Sun},
   title        = {{PVA}: A Self-Adaptive Personal View Agent},
   booktitle    = {Proceedings of KDD-01, 7th ACM SIGKDD International Conference on 
                   Knowledge Discovery and Data Mining},
   editor       = {Foster Provost and Ramakrishnan Srikant},
   year         = {2001},
   pages        = {257--262},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {San Francisco, {US}},
   url          = {http://doi.acm.org/10.1145/502512.502548},
   abstract     = {In this paper, we present PVA, an adaptive personal view 
                    information agent system to track, learn, and manage user 
                    interests in Internet documents. When user interests change, 
                    not only the contents but also the structure of the user 
                    profile are modified in PVA to adapt to the changes. Experimental 
                    results show that modulating the structure of the user profile does 
                    increase the accuracy of personalization systems.},
}
@article{Chen02,
   author       = {Chien Chin Chen and Chang Chen, Meng and Yeali Sun},
   title        = {{PVA}: A Self-Adaptive Personal View Agent},
   journal      = {Journal of Intelligent Information Systems},
   year         = {2002},
   note         = {Special Issue on Automated Text Categorization},
   volume       = {18},
   number       = {2/3},
   pages        = {173--194},
   url          = {http://www.wkap.nl/article.pdf?391245},
   abstract     = {In this paper, we present PVA, an adaptive personal view 
                   information agent system for tracking, learning and managing user 
                   interests in Internet documents. PVA consists of three parts: a 
                   {\it proxy}, {\it personal view constructor}, and {\it personal 
                   view maintainer}. The proxy logs the user's activities and 
                   extracts the user's interests without user intervention. The 
                   personal view constructor mines user interests and maps them to a 
                   class hierarchy (i.e., personal view). The personal view 
                   maintainer synchronizes user interests and the personal view 
                   periodically. When user interests change, in PVA, not only the 
                   contents, but also the structure of the user profile are modified 
                   to adapt to the changes. In addition, PVA considers the aging 
                   problem of user interests. The experimental results show that 
                   modulating the structure of the user profile increases the 
                   accuracy of a personalization system.},
}
@inProceedings{Chen00,
   author       = {Hao Chen and Susan T. Dumais},
   title        = {Bringing order to the {W}eb: automatically categorizing search 
                   results},
   booktitle    = {Proceedings of CHI-00, ACM International Conference on Human 
                   Factors in Computing Systems},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {},
   year         = {2000},
   address      = {Den Haag, {NL}},
   pages        = {145--152},
   url          = {http://www.acm.org/pubs/articles/proceedings/chi/332040/p145-chen/p145-chen.pdf},
   abstract     = {We developed a user interface that organizes Web search results 
                   into hierarchical categories. Text classification algorithms were 
                   used to automatically classify arbitrary search results into an 
                   existing category structure on-the-fly. A user study compared our 
                   new category interface with the typical ranked list interface of 
                   search results. The study showed that the category interface is 
                   superior both in objective and subjective measures. Subjects 
                   liked the category interface much better than the list interface, 
                   and they were 50\% faster at finding information that was 
                   organized into categories. Organizing search results allows users 
                   to focus on items in categories of interest rather than having to 
                   browse through all the results sequentially.},
}
@inProceedings{Chen00a,
   author       = {Hao Chen and Tin Kam Ho},
   title        = {Evaluation of Decision Forests on Text Categorization},
   booktitle    = {Proceedings of the 7th SPIE Conference on Document Recognition 
                   and Retrieval},
   publisher    = {{SPIE} {}-{} The International Society for Optical Engineering},
   editor       = {Daniel P. Lopresti and Jiangying Zhou},
   year         = {2000},
   address      = {San Jose, {US}},
   pages        = {191--199},
   url          = {http://cm.bell-labs.com/who/tkh/papers/textcat.ps.gz},
   abstract     = {Text categorization is useful for indexing documents for 
                   information retrieval, filtering parts for document 
                   understanding, and summarizing contents of documents of special 
                   interests. We describe a text categorization task and an 
                   experiment using documents from the Reuters and OHSUMED 
                   collections. We applied the Decision Forest classifier and 
                   compared its accuracies to those of C4.5 and kNN classifiers, 
                   using both category dependent and category independent term 
                   selection schemes. It is found that Decision Forest outperforms 
                   both C4.5 and kNN in all cases, and that category dependent term 
                   selection yields better accuracies. Performances of all three 
                   classifiers degrade from the Reuters collection to the OHSUMED 
                    collection, but Decision Forest remains superior.},
}
@inProceedings{Cheng01,
   author       = {Cheng, Chun-Hung and Jian Tang and Ada Wai-Chee Fu and Irwin King},
   title        = {Hierarchical Classification of Documents with Error Control},
   booktitle    = {Proceedings of PAKDD-01, 5th Pacific-Asia Conference on 
                   Knowledge Discovery and Data Mining},
   editor       = {David Cheung and Qing Li and Graham Williams},
   year         = {2001},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Hong Kong, {CN}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2035},
   pages        = {433--443},
   url          = {http://link.springer-ny.com/link/service/series/0558/papers/2035/20350433.pdf},
   abstract     = {Classification is a function that matches a new object with one 
                   of the predefined classes. Document classification is 
                   characterized by the large number of attributes involved in the 
                   objects (documents). The traditional method of building a single 
                   classifier to do all the classification work would incur a high 
                    overhead. Hierarchical classification is a more efficient 
                   method - instead of a single classifier, we use a set of 
                   classifiers distributed over a class taxonomy, one for each 
                   internal node. However, once a misclassification occurs at a high 
                   level class, it may result in a class that is far apart from the 
                   correct one. An existing approach to coping with this problem 
                   requires terms also to be arranged hierarchically. In this paper, 
                   instead of overhauling the classifier itself, we propose 
                   mechanisms to detect misclassification and take appropriate 
                   actions. We then discuss an alternative that masks the 
                   misclassification based on a well known software fault tolerance 
                   technique. Our experiments show our algorithms represent a good 
                   trade-off between speed and accuracy in most applications.},
}
@inProceedings{Cheong02,
   author       = {Cheong Fung, Gabriel P. and Jeffrey X. Yu and Hongjun Lu},
   title        = {Discriminative Category Matching: Efficient Text Classification 
                   for Huge Document Collections},
   booktitle    = {Proceedings of ICDM-02, 2nd IEEE International Conference on Data 
                   Mining},
   editor       = {},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   address      = {Maebashi City, {JP}},
   year         = {2002},
   pages        = {187--194},
   url          = {http://dlib.computer.org/conferen/icdm/1754/pdf/17540187.pdf},
   abstract     = {With the rapid growth of textual information available on the 
                   Internet, having a good model for classifying and managing 
                    documents automatically is undoubtedly important. When more 
                   documents are archived, new terms, new concepts and concept-drift 
                   will frequently appear. Without a doubt, updating the 
                   classification model frequently rather than using the old model 
                   for a very long period is absolutely essential. Here, the 
                   challenges are: a) obtain a high accuracy classification model; 
                   b) consume low computational time for both model training and 
                   operation; and c) occupy low storage space. However, none of the 
                   existing classification approaches could achieve all of these 
                   requirements. In this paper, we propose a novel text 
                   classification approach, called Discriminative Category Matching, 
                   which could achieve all of the stated characteristics. Extensive 
                   experiments using two benchmarks and a large real-life collection 
                   are conducted. The encouraging results indicated that our 
                    approach is highly feasible.},
}
@article{Chouchoulas01,
   author       = {Alexios Chouchoulas and Qiang Shen},
   title        = {Rough set-aided keyword reduction for text categorization},
   journal      = {Applied Artificial Intelligence},
   pages        = {843--873},
   year         = {2001},
   volume       = {15},
   number       = {9},
   url          = {},
   abstract     = {The volume of electronically stored information increases 
                   exponentially as the state of the art progresses. Automated 
                   information filtering (IF) and information retrieval (IR) systems 
                   are therefore acquiring rapidly increasing prominence. However, 
                   such systems sacrifice efficiency to boost effectiveness. Such 
                    systems typically have to cope with sets of vectors of many tens 
                   of thousands of dimensions. Rough set (RS) theory can be applied 
                   to reducing the dimensionality of data used in IF/IR tasks, by 
                   providing a measure of the information content of datasets with 
                   respect to a given classification. This can aid IF/IR systems 
                   that rely on the acquisition of large numbers of term weights or 
                   other measures of relevance. This article investigates the 
                   applicability of RS theory to the IF/IR application domain and 
                   compares this applicability with respect to various existing TC 
                    techniques. The ability of the approach to generalize, given a 
                    minimum of training data, is also addressed. The background of RS 
                   theory is presented, with an illustrative example to demonstrate 
                   the operation of the RS-based dimensionality reduction. A modular 
                   system is proposed which allows the integration of this technique 
                   with a large variety of different IF/IR approaches. The example 
                   application, categorization of E-mail messages, is described. 
                   Systematic experiments and their results are reported and 
                   analyzed.},
}
@inProceedings{Chuang00,
   author       = {Wesley T. Chuang and Asok Tiyyagura and Jihoon Yang and Giovanni 
                   Giuffrida},
   title        = {A Fast Algorithm for Hierarchical Text Classification},
   booktitle    = {Proceedings of DaWaK-00, 2nd International Conference on Data 
                   Warehousing and Knowledge Discovery},
   editor       = {Yahiko Kambayashi and Mukesh Mohania and A.Min Tjoa},
   year         = {2000},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1874},
   address      = {London, {UK}},
   pages        = {409--418},
   url          = {http://www.cs.iastate.edu/~yang/Papers/dawak00.ps},
   abstract     = {Text classification is becoming more important with the 
                   proliferation of the Internet and the huge amount of data it 
                   transfers. We present an efficient algorithm for text 
                   classification using hierarchical classifiers based on a concept 
                   hierarchy. The simple TFIDF classifier is chosen to train sample 
                   data and to classify other new data. Despite its simplicity, 
                   results of experiments on Web pages and TV closed captions 
                   demonstrate high classification accuracy. Application of feature 
                   subset selection techniques improves the performance. Our 
                   algorithm is computationally efficient being bounded by O(n log 
                    n) for n samples.},
}
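%
% A minimal Python sketch (not the authors' code) of the flat tfidf
% classifier that \cite{Chuang00} arranges into a hierarchy: documents are
% mapped to tf*idf vectors, each category is represented by the summed
% vector of its training documents, and a new document is assigned to the
% category whose centroid is closest in cosine similarity. Tokenization and
% the reuse of training idf values for new documents are assumed.
%
%   import math
%   from collections import Counter, defaultdict
%
%   def tfidf(docs):
%       # docs: list of token lists; returns one {term: tf*idf} dict per doc
%       n, df = len(docs), Counter(t for d in docs for t in set(d))
%       return [{t: c * math.log(n / df[t]) for t, c in Counter(d).items()}
%               for d in docs]
%
%   def cosine(u, v):
%       dot = sum(w * v.get(t, 0.0) for t, w in u.items())
%       nu = math.sqrt(sum(w * w for w in u.values()))
%       nv = math.sqrt(sum(w * w for w in v.values()))
%       return dot / (nu * nv) if nu and nv else 0.0
%
%   def train(docs, labels):
%       centroids = defaultdict(Counter)       # one centroid per category
%       for vec, y in zip(tfidf(docs), labels):
%           centroids[y].update(vec)
%       return centroids
%
%   def classify(doc_vec, centroids):
%       return max(centroids, key=lambda y: cosine(doc_vec, centroids[y]))
%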
@inProceedings{Ciravegna99,
   author       = {Fabio Ciravegna and Alberto Lavelli and Nadia Mana and Johannes 
                   Matiasek and Luca Gilardoni and Silvia Mazza and William J. Black 
                   and Fabio Rinaldi},
   title        = {{FACILE}: Classifying Texts Integrating Pattern Matching and 
                   Information Extraction},
   booktitle    = {Proceedings of IJCAI-99, 16th International Joint Conference on 
                   Artificial Intelligence},
   editor       = {Thomas Dean},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   year         = {1999},
   pages        = {890--895},
   address      = {Stockholm, {SE}},
   url          = {http://ecate.itc.it:1024/lavelli/lavelli-papers/IJCAI99/ijcai99.ps.gz},
   abstract     = {Successfully managing information means being able to find 
                   relevant new information and to correctly integrate it with 
                   pre-existing knowledge. Much information is nowadays stored as 
                   multilingual textual data; therefore advanced classification 
                   systems are currently considered as strategic components for 
                   effective knowledge management. We describe an experience 
                   integrating different innovative AI technologies such as 
                   hierarchical pattern matching and information extraction to 
                   provide flexible multilingual classification adaptable to user 
                   needs. Pattern matching produces fairly accurate and fast 
                   categorisation over a large number of classes, while information 
                   extraction provides fine-grained classification for a reduced 
                   number of classes. The resulting system was adopted by the main 
                   Italian financial news agency providing a pay-to-view service.},
}
@inProceedings{Clack97,
   author       = {Chris Clack and Johnny Farringdon and Peter Lidwell and Tina Yu},
   title        = {Autonomous document classification for business},
   editor       = {W. Lewis Johnson},
   publisher    = {{ACM} Press, New York, {US}},
   booktitle    = {Proceedings of the 1st International Conference on Autonomous 
                   Agents},
   address      = {Marina Del Rey, {US}},
   year         = {1997},
   pages        = {201--208},
   url          = {http://www.acm.org/pubs/articles/proceedings/ai/267658/p201-clack/p201-clack.pdf},
   abstract     = {With the continuing exponential growth of the Internet and the 
                   more recent growth of business Intranets, the commercial world is 
                   becoming increasingly aware of the problem of electronic 
                   information overload. This has encouraged interest in developing 
                   agents/softbots that can act as electronic personal assistants 
                    and can develop and adapt representations of users' information 
                   needs, commonly known as profiles. As the result of collaborative 
                   research with Friends of the Earth, an environmental issues 
                   campaigning organisation, we have developed a general purpose 
                   information classification agent architecture and have applied it 
                   to the problem of document classification and routing. 
                   Collaboration with Friends of the Earth allows us to test our 
                   ideas in a non-academic context involving high volumes of 
                   documents. We use the technique of genetic programming (GP), 
                   (Koza and Rice 1992), to evolve classifying agents. This is a 
                   novel approach for document classification, where each agent 
                   evolves a parse-tree representation of a user's particular 
                   information need. The other unusual features of our research are 
                   the longevity of our agents and the fact that they undergo a 
                   continual training process; feedback from the user enables the 
                   agent to adapt to the user's long-term information requirements.},
}
@inProceedings{Cohen95,
   author       = {William W. Cohen},
   title        = {Text categorization and relational learning},
   booktitle    = {Proceedings of ICML-95, 12th International Conference on Machine 
                   Learning},
   editor       = {Armand Prieditis and Stuart J. Russell},
   address      = {Lake Tahoe, {US}},
   year         = {1995},
   pages        = {124--132},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www.research.whizbang.com/~wcohen/postscript/ml-95-ir.ps},
   abstract     = {We evaluate the first order learning system FOIL on a series of 
                   text categorization problems. It is shown that FOIL usually forms 
                   classifiers with lower error rates and higher rates of precision 
                   and recall with a relational encoding than with a propositional 
                   encoding. We show that FOIL's performance can be improved by 
                   relation selection, a first order analog of feature selection. 
                   Relation selection improves FOIL's performance as measured by any 
                   of recall, precision, F-measure, or error rate. With an 
                   appropriate level of relation selection, FOIL appears to be 
                   competitive with or superior to existing propositional 
                   techniques.},
}
@inCollection{Cohen95a,
   author       = {William W. Cohen},
   title        = {Learning to classify {E}nglish text with {ILP} methods},
   booktitle    = {Advances in inductive logic programming},
   editor       = {De Raedt, Luc},
   publisher    = {{IOS} Press},
   address      = {Amsterdam, {NL}},
   pages        = {124--143},
   year         = {1995},
   url          = {http://www.research.whizbang.com/~wcohen/postscript/ilp.ps},
   abstract     = {Text categorization is the task of classifying text into one of 
                   several predefined categories. In this paper we will evaluate the 
                   effectiveness of several ILP methods for text categorization, and 
                   also compare them to their propositional analogs. The methods 
                   considered are FOIL, the propositional rule-learning system 
                   RIPPER, and a first-order version of RIPPER called FLIPPER. We 
                   show that the benefit of using a first-order representation in 
                   this domain is relatively modest; in particular, the performance 
                   difference between FLIPPER and FOIL and their propositional 
                   counterparts is quite small, compared to the differences between 
                   FOIL and FLIPPER. However, a first-order representation seems to 
                   be advantageous when high-precision classifiers are desirable.},
}
@inProceedings{Cohen96a,
   author       = {William W. Cohen and Yoram Singer},
   title        = {Context-sensitive learning methods for text categorization},
   booktitle    = {Proceedings of SIGIR-96, 19th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Hans-Peter Frei and Donna Harman and Peter Sch{\"{a}}uble and 
                   Ross Wilkinson},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1996},
   address      = {Z{\"{u}}rich, {CH}},
   pages        = {307--315},
   note         = {An extended version appears as~\cite{Cohen99}},
   url          = {http://www.research.whizbang.com/~wcohen/postscript/sigir-96.ps},
   abstract     = {Two machine learning algorithms, RIPPER and sleeping experts for 
                   phrases, are evaluated on a number of large text categorization 
                   problems. These algorithms both construct classifiers that allow 
                   the ``context'' of a word w to affect how (or even whether) the 
                   presence or absence of w will contribute to a classification. 
                   However, RIPPER and sleeping experts differ radically in many 
                   other respects. Differences include: different notions as to what 
                   constitutes a context; different ways of combining contexts to 
                   construct a classifier; different methods to search for a 
                   combination of contexts; and different criteria as to what 
                   contexts should be included in such a combination. In spite of 
                   these differences, both RIPPER and sleeping experts perform 
                   extremely well across a wide variety of categorization problems, 
                   generally outperforming previously applied learning methods. We 
                   view this result as a confirmation of the usefulness of 
                   classifiers that represent contextual information.},
}
@inProceedings{Cohen98,
   author       = {William W. Cohen and Haym Hirsh},
   title        = {Joins that generalize: text classification using {{\sc Whirl}}},
   booktitle    = {Proceedings of KDD-98, 4th International Conference on Knowledge 
                   Discovery and Data Mining},
   editor       = {Rakesh Agrawal and Paul E. Stolorz and Gregory Piatetsky-Shapiro},
   publisher    = {{AAAI} Press, Menlo Park, {US}},
   year         = {1998},
   address      = {New York, {US}},
   pages        = {169--173},
   url          = {http://www.research.whizbang.com/~wcohen/postscript/kdd-98.ps},
   abstract     = {WHIRL is an extension of relational databases that can perform 
                   ``soft joins'' based on the similarity of textual identifiers; 
                   these soft joins extend the traditional operation of joining 
                   tables based on the equivalence of atomic values. This paper 
                   evaluates WHIRL on a number of inductive classification tasks 
                   using data from the World Wide Web. We show that although WHIRL 
                   is designed for more general similarity-based reasoning tasks, it 
                   is competitive with mature inductive classification systems on 
                   these classification tasks. In particular, WHIRL generally 
                   achieves lower generalization error than C4.5, RIPPER, and 
                    several nearest-neighbor methods. WHIRL is also fast, up to 500 
                   times faster than C4.5 on some benchmark problems. We also show 
                   that WHIRL can be efficiently used to select from a large pool of 
                   unlabeled items those that can be classified correctly with high 
                   confidence.},
}
@article{Cohen99,
   author       = {William W. Cohen and Yoram Singer},
   title        = {Context-sensitive learning methods for text categorization},
   journal      = {{ACM} Transactions on Information Systems},
   year         = {1999},
   volume       = {17},
   number       = {2},
   pages        = {141--173},
   url          = {http://www.acm.org/pubs/articles/journals/tois/1999-17-2/p141-cohen/p141-cohen.pdf},
   abstract     = {Two recently implemented machine-learning algorithms, RIPPER and 
                   sleeping-experts for phrases, are evaluated on a number of large 
                   text categorization problems. These algorithms both construct 
                   classifiers that allow the ``context'' of a word w to affect how 
                   (or even whether) the presence or absence of w will contribute to 
                   a classification. However, RIPPER and sleeping-experts differ 
                   radically in many other respects: differences include different 
                   notions as to what constitutes a context, different ways of 
                   combining contexts to construct a classifier, different methods 
                   to search for a combination of contexts, and different criteria 
                   as to what contexts should be included in such a combination. In 
                   spite of these differences, both RIPPER and sleeping-experts 
                   perform extremely well across a wide variety of categorization 
                   problems, generally outperforming previously applied learning 
                   methods. We view this result as a confirmation of the usefulness 
                   of classifiers that represent contextual information.},
}
@inProceedings{Crammer02,
   author       = {Koby Crammer and Yoram Singer},
   title        = {A New Family of Online Algorithms for Category Ranking},
   booktitle    = {Proceedings of SIGIR-02, 25th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng 
                   and Kalervo J{\"{a}}rvelin},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Tampere, {FI}},
   year         = {2002},
   pages        = {151--158},
   url          = {http://doi.acm.org/10.1145/564376.564404},
   abstract     = {We describe a new family of topic-ranking algorithms for 
                   multi-labeled documents. The motivation for the algorithms stems 
                   from recent advances in online learning algorithms. The 
                   algorithms we present are simple to implement and are time and 
                   memory efficient. We evaluate the algorithms on the Reuters-21578 
                   corpus and the new corpus released by Reuters in 2000. On both 
                   corpora the algorithms we present outperform adaptations to 
                   topic-ranking of Rocchio's algorithm and the Perceptron 
                   algorithm. We also outline the formal analysis of the algorithm 
                   in the mistake bound model. To our knowledge, this work is the 
                   first to report performance results with the entire new Reuters 
                   corpus.},
}
@inProceedings{Craven98,
   author       = {Mark Craven and Dan DiPasquo and Dayne Freitag and Andrew K. 
                   McCallum and Tom M. Mitchell and Kamal Nigam and Se{\'{a}}n 
                   Slattery},
   title        = {Learning to extract symbolic knowledge from the {W}orld {W}ide 
                   {W}eb},
   booktitle    = {Proceedings of AAAI-98, 15th Conference of the American 
                   Association for Artificial Intelligence},
   publisher    = {{AAAI} Press, Menlo Park, {US}},
   year         = {1998},
   pages        = {509--516},
   address      = {Madison, {US}},
   note         = {An extended version appears as~\cite{Craven00}},
   url          = {http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-11/www/wwkb/overview-aaai98.ps.gz},
   abstract     = {The World Wide Web is a vast source of information accessible to 
                   computers, but understandable only to humans. The goal of the 
                   research described here is to automatically create a computer 
                   understandable world wide knowledge base whose content mirrors 
                   that of the World Wide Web. Such a knowledge base would enable 
                   much more effective retrieval of Web information, and promote new 
                   uses of the Web to support knowledge-based inference and problem 
                   solving. Our approach is to develop a trainable information 
                   extraction system that takes two inputs: an ontology defining the 
                   classes and relations of interest, and a set of training data 
                   consisting of labeled regions of hypertext representing instances 
                   of these classes and relations. Given these inputs, the system 
                   learns to extract information from other pages and hyperlinks on 
                   the Web. This paper describes our general approach, several 
                   machine learning algorithms for this task, and promising initial 
                   results with a prototype system.},
}
@article{Craven00,
   author       = {Mark Craven and Dan DiPasquo and Dayne Freitag and Andrew K. 
                   McCallum and Tom M. Mitchell and Kamal Nigam and Se{\'{a}}n 
                   Slattery},
   title        = {Learning to Construct Knowledge Bases from the {W}orld {W}ide 
                   {W}eb},
   journal      = {Artificial Intelligence},
   volume       = {118},
   number       = {1/2},
   year         = {2000},
   pages        = {69--113},
   url          = {http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-11/www/wwkb/overview-aij99.ps.gz},
   abstract     = {The World Wide Web is a vast source of information accessible to 
                   computers, but understandable only to humans. The goal of the 
                   research described here is to automatically create a computer 
                   understandable knowledge base whose content mirrors that of the 
                   World Wide Web. Such a knowledge base would enable much more 
                   effective retrieval of Web information, and promote new uses of 
                   the Web to support knowledge-based inference and problem solving. 
                   Our approach is to develop a trainable information extraction 
                   system that takes two inputs. The first is an ontology that 
                   defines the classes (e.g., company, person, employee, product) 
                    and relations (e.g., employed\_by, produced\_by) of interest when 
                   creating the knowledge base. The second is a set of training data 
                   consisting of labeled regions of hypertext that represent 
                   instances of these classes and relations. Given these inputs, the 
                   system learns to extract information from other pages and 
                   hyperlinks on the Web. This article describes our general 
                   approach, several machine learning algorithms for this task, and 
                   promising initial results with a prototype system that has 
                   created a knowledge base describing university people, courses, 
                   and research projects.},
}
@article{Craven01,
   author       = {Craven, Mark and Slattery, Se{\'{a}}n},
   title        = {Relational learning with statistical predicate invention: Better 
                   models for hypertext},
   journal      = {Machine Learning},
   pages        = {97--119},
   year         = {2001},
   volume       = {43},
   number       = {1/2},
   url          = {http://www.wkap.nl/article.pdf?321079},
   abstract     = {We present a new approach to learning hypertext classifiers that 
                   combines a statistical text-learning method with a relational 
                   rule learner. This approach is well suited to learning in 
                   hypertext domains because its statistical component allows it to 
                   characterize text in terms of word frequencies, whereas its 
                   relational component is able to describe how neighboring 
                   documents are related to each other by hyperlinks that connect 
                   them. We evaluate our approach by applying it to tasks that 
                   involve learning definitions for (i) classes of pages, (ii) 
                   particular relations that exist between pairs of pages, and (iii) 
                   locating a particular class of information in the internal 
                   structure of pages. Our experiments demonstrate that this new 
                   approach is able to learn more accurate classifiers than either 
                   of its constituent methods alone.},
}
@article{Creecy92,
   author       = {Robert M. Creecy and Brij M. Masand and Stephen J. Smith and 
                   David L. Waltz},
   title        = {Trading {MIPS} and memory for knowledge engineering: classifying 
                   census returns on the {C}onnection {M}achine},
   journal      = {Communications of the {ACM}},
   volume       = {35},
   number       = {8},
   year         = {1992},
   pages        = {48--63},
   url          = {http://www.acm.org/pubs/articles/journals/cacm/1992-35-8/p48-creecy/p48-creecy.pdf},
}
@inProceedings{Cristianini01,
   author       = {Nello Cristianini and John Shawe-Taylor and Huma Lodhi},
   title        = {Latent Semantic Kernels},
   booktitle    = {Proceedings of ICML-01, 18th International Conference on Machine 
                   Learning},
   editor       = {Carla Brodley and Andrea Danyluk},
   address      = {Williams College, {US}},
   year         = {2001},
   pages        = {66--73},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {},
   abstract     = {Kernel methods like Support Vector Machines have successfully 
                   been used for text categorization. A standard choice of kernel 
                   function has been the inner product between the vector-space 
                   representation of two documents, in analogy with classical 
                   information retrieval (IR) approaches. Latent Semantic Indexing 
                   (LSI) has been successfully used for IR purposes, as a technique 
                   for capturing semantic relations between terms and inserting them 
                   into the similarity measure between two documents. One of its 
                   main drawbacks, in IR, is its computational cost. In this paper 
                   we describe how the LSI approach can be implemented in a 
                   kernel-defined feature space. We provide experimental results 
                   demonstrating that the approach can significantly improve 
                   performance, and that it does not impair it.},
}
@inCollection{Cristianini01a,
   author       = {Huma Lodhi and John Shawe-Taylor and Nello Cristianini and 
                   Christopher J. Watkins},
   title        = {Discrete Kernels for Text Categorisation},
   booktitle    = {Advances in Neural Information Processing Systems},
   editor       = {Todd K. Leen and Thomas G. Dietterich and Volker Tresp},
   volume       = {13},
   year         = {2001},
   pages        = {563--569},
   publisher    = {{MIT} Press, Cambridge, {MA}},
   url          = {http://www.support-vector.net/papers/LodhiShawe-TaylorCristianiniWatkins_ps.ps},
   abstract     = {},
}
@article{Cristianini02,
   author       = {Nello Cristianini and John Shawe-Taylor and Huma Lodhi},
   title        = {Latent Semantic Kernels},
   journal      = {Journal of Intelligent Information Systems},
   year         = {2002},
   note         = {Special Issue on Automated Text Categorization},
   volume       = {18},
   number       = {2/3},
   pages        = {127--152},
   url          = {http://www.wkap.nl/article.pdf?391243},
   abstract     = {Kernel methods like Support Vector Machines have successfully 
                   been used for text categorization. A standard choice of kernel 
                   function has been the inner product between the vector-space 
                   representation of two documents, in analogy with classical 
                   information retrieval (IR) approaches. Latent Semantic Indexing 
                   (LSI) has been successfully used for IR purposes as a technique 
                   for capturing semantic relations between terms and inserting them 
                   into the similarity measure between two documents. One of its 
                   main drawbacks, in IR, is its computational cost. In this paper 
                   we describe how the LSI approach can be implemented in a 
                   kernel-defined feature space. We provide experimental results 
                   demonstrating that the approach can significantly improve 
                   performance, and that it does not impair it.},
}
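%
% The core idea of the two ``Latent Semantic Kernels'' entries above can be
% written down directly. This is a sketch, not the authors' formulation
% (which performs the computation entirely in kernel-defined feature
% space): documents are projected onto the leading singular directions of
% the term-document matrix and compared by inner products in that latent
% space.
%
%   import numpy as np
%
%   def latent_semantic_kernel(X, k):
%       # X: (documents x terms) weight matrix; k: number of latent concepts
%       X = np.asarray(X, dtype=float)
%       _, _, Vt = np.linalg.svd(X, full_matrices=False)
%       Z = X @ Vt[:k].T        # documents in the k-dimensional latent space
%       return Z @ Z.T          # kernel matrix K[i, j] = <z_i, z_j>
%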
@inProceedings{Dalessio98,
   author       = {Stephen D'Alessio and Keitha Murray and Robert Schiaffino and 
                   Aaron Kershenbaum},
   title        = {Category Levels in Hierarchical Text Categorization},
   booktitle    = {Proceedings of EMNLP-98, 3rd Conference on Empirical Methods in 
                   Natural Language Processing},
   year         = {1998},
   publisher    = {Association for Computational Linguistics, Morristown, {US}},
   editor       = {},
   pages        = {},
   address      = {Granada, {ES}},
   url          = {http://www.iona.edu/cs/FacultyPublications/emnlpf.pdf},
   abstract     = {We consider the problem of assigning level numbers (weights) to 
                   hierarchically organized categories during the process of text 
                   categorization. These levels control the ability of the 
                   categories to attract documents during the categorization 
                   process. The levels are adjusted in order to obtain a balance 
                   between recall and precision for each category. If a category's 
                   recall exceeds its precision, the category is too strong and its 
                   level is reduced. Conversely, a category's level is increased to 
                   strengthen it if its precision exceeds its recall. The 
                   categorization algorithm used is a supervised learning procedure 
                   that uses a linear classifier based on the category levels. We 
                   are given a set of categories, organized hierarchically. We are 
                   also given a training corpus of documents already placed in one 
                   or more categories. From these, we extract vocabulary, words that 
                   appear with high frequency within a given category, 
                   characterizing each subject area. Each node's vocabulary is 
                   filtered and its words assigned weights with respect to the 
                   specific category. Then, test documents are scanned and 
                   categories ranked based on the presence of vocabulary terms. 
                   Documents are assigned to categories based on these rankings. We 
                   demonstrate that precision and recall can be significantly 
                   improved by solving the categorization problem taking hierarchy 
                   into account. Specifically, we show that by adjusting the 
                    category levels in a principled way, precision can be 
                   significantly improved, from 84\% to 91\%, on the much-studied 
                   Reuters-21578 corpus organized in a three-level hierarchy of 
                   categories.},
}
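%
% The level-adjustment heuristic described in the abstract above reduces to
% a very small update rule; the sketch below (with a hypothetical step
% size, not taken from the paper) weakens categories whose recall exceeds
% their precision and strengthens those for which precision exceeds recall.
%
%   def adjust_levels(levels, precision, recall, step=0.1):
%       # levels, precision, recall: dicts indexed by category name
%       for c in levels:
%           if recall[c] > precision[c]:
%               levels[c] -= step    # category attracts too many documents
%           elif precision[c] > recall[c]:
%               levels[c] += step    # category attracts too few documents
%       return levels
%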
@inProceedings{Dalessio00,
   author       = {Stephen D'Alessio and Keitha Murray and Robert Schiaffino and 
                   Aaron Kershenbaum},
   title        = {The effect of using Hierarchical classifiers in Text 
                   Categorization},
   booktitle    = {Proceedings of RIAO-00, 6th International Conference ``Recherche 
                    d'Information Assist{\'e}e par Ordinateur''},
   editor       = {},
   address      = {Paris, {FR}},
   year         = {2000},
   pages        = {302--313},
   url          = {http://www.iona.edu/cs/FacultyPublications/riao2000New.pdf},
   abstract     = {Given a set of categories, with or without a preexisting 
                   hierarchy among them, we consider the problem of assigning 
                   documents to one or more of these categories from the point of 
                   view of a hierarchy with more or less depth. We can choose to 
                   make use of none, part or all of the hierarchical structure to 
                   improve the categorization effectiveness and efficiency. It is 
                   possible to create additional hierarchy among the categories. We 
                   describe a procedure for generating a hierarchy of classifiers 
                   that model the hierarchy structure. We report on computational 
                   experience using this procedure. We show that judicious use of a 
                   hierarchy can significantly improve both the speed and 
                   effectiveness of the categorization process. Using the 
                   Reuters-21578 corpus, we obtain an improvement in running time of 
                   over a factor of three and a 5\% improvement in F-measure.},
}
@article{Dasigi01,
   author       = {Dasigi, Venu and Mann, Reinhold C. and Protopopescu, Vladimir A.},
   title        = {Information fusion for text classification: an experimental 
                   comparison},
   journal      = {Pattern Recognition},
   year         = {2001},
   volume       = {34},
   number       = {12},
   pages        = {2413--2425},
   url          = {},
   abstract     = {This article reports on our experiments and results on the 
                   effectiveness of different feature sets and information fusion 
                   from some combinations of them in classifying free text documents 
                   into a given number of categories. We use different feature sets 
                   and integrate neural network learning into the method. The 
                   feature sets are based on the ``latent semantics'' of a reference 
                    library -- a collection of documents adequately representing the 
                   desired concepts. We found that a larger reference library is not 
                   necessarily better. Information fusion almost always gives better 
                   results than the individual constituent feature sets, with 
                   certain combinations doing better than the others.},
}
@inProceedings{Debole03,
   author       = {Franca Debole and Fabrizio Sebastiani},
   title        = {Supervised term weighting for automated text categorization},
   year         = {2003},
   booktitle    = {Proceedings of SAC-03, 18th ACM Symposium on Applied Computing},
   address      = {Melbourne, {US}},
   publisher    = {{ACM} Press, New York, {US}},
   pages        = {784--788},
   url          = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/SAC03b.pdf},
   abstract     = {The construction of a text classifier usually involves (i) a 
                   phase of \emph{term selection}, in which the most relevant terms 
                   for the classification task are identified, (ii) a phase of 
                   \emph{term weighting}, in which document weights for the selected 
                   terms are computed, and (iii) a phase of \emph{classifier 
                   learning}, in which a classifier is generated from the weighted 
                   representations of the training documents. This process involves 
                   an activity of {\em supervised learning}, in which information on 
                   the membership of training documents in categories is used. 
                   Traditionally, supervised learning enters only phases (i) and 
                   (iii). In this paper we propose instead that learning from 
                   training data should also affect phase (ii), i.e.\ that 
                   information on the membership of training documents to categories 
                   be used to determine term weights. We call this idea 
                   \emph{supervised term weighting} (STW). As an example, we propose 
                   a number of ``supervised variants'' of $tfidf$ weighting, 
                   obtained by replacing the $idf$ function with the function that 
                   has been used in phase (i) for term selection. We present 
                   experimental results obtained on the standard 
                   \textsf{Reuters-21578} benchmark with one classifier learning 
                   method (support vector machines), three term selection functions 
                   (information gain, chi-square, and gain ratio), and both local 
                   and global term selection and weighting.},
}
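%
% One of the ``supervised variants'' of tfidf mentioned in the abstract
% above can be sketched as follows (illustrative only; the contingency
% counts tp, fp, fn, tn are the numbers of training documents that do or do
% not contain the term and do or do not belong to the category):
%
%   import math
%
%   def chi_square(tp, fp, fn, tn):
%       # chi-square statistic of the (term, category) contingency table
%       n = tp + fp + fn + tn
%       den = (tp + fp) * (fn + tn) * (tp + fn) * (fp + tn)
%       return n * (tp * tn - fp * fn) ** 2 / den if den else 0.0
%
%   def stw_weight(tf, tp, fp, fn, tn):
%       # supervised term weighting: the collection-wide idf factor of tfidf
%       # is replaced by the category-aware term-selection score
%       return tf * chi_square(tp, fp, fn, tn)
%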
@inProceedings{deBuenaga97,
   author       = {De Buenaga Rodr{\'{\i}}guez, Manuel and G{\'o}mez-Hidalgo, 
                   Jos{\'e} Mar{\'{\i}}a and D{\'{\i}}az-Agudo, Bel{\'e}n},
   title        = {Using {WordNet} to Complement Training Information in Text 
                   Categorization},
   booktitle    = {Proceedings of RANLP-97, 2nd International Conference on Recent 
                   Advances in Natural Language Processing},
   publisher    = {},
   editor       = {Ruslan Mitkov and Nicolas Nicolov and Nikolai Nikolov},
   address      = {Tzigov Chark, {BG}},
   pages        = {},
   year         = {1997},
   url          = {http://xxx.unizar.es/ps/cmp-lg/9709007},
   abstract     = {Automatic Text Categorization (TC) is a complex and useful task 
                   for many natural language applications, and is usually performed 
                   through the use of a set of manually classified documents, a 
                   training collection. We suggest the utilization of additional 
                   resources like lexical databases to increase the amount of 
                   information that TC systems make use of, and thus, to improve 
                   their performance. Our approach integrates WordNet information 
                   with two training approaches through the Vector Space Model. The 
                   training approaches we test are the Rocchio (relevance feedback) 
                   and the Widrow-Hoff (machine learning) algorithms. Results 
                   obtained from evaluation show that the integration of WordNet 
                   clearly outperforms training approaches, and that an integrated 
                   technique can effectively address the classification of low 
                   frequency categories.},
}
@inProceedings{deLima98,
   author       = {De Lima, Luciano R. and Laender, Alberto H. and Ribeiro-Neto, 
                   Berthier A.},
   title        = {A hierarchical approach to the automatic categorization of 
                   medical documents},
   booktitle    = {Proceedings of CIKM-98, 7th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {Georges Gardarin and James C. French and Niki Pissinou and Kia 
                   Makki and Luc Bouganim},
   year         = {1998},
   address      = {Bethesda, {US}},
   pages        = {132--139},
   url          = {http://www.acm.org/pubs/articles/proceedings/cikm/288627/p132-de_lima/p132-de_lima.pdf},
   abstract     = {},
}
@article{deVel01,
   author       = {Olivier Y. De Vel and Alison Anderson and Malcolm Corney and 
                   George M. Mohay},
   title        = {Mining Email Content for Author Identification Forensics},
   journal      = {{SIGMOD} Record},
   year         = {2001},
   volume       = {30},
   number       = {4},
   pages        = {55--64},
   url          = {},
   abstract     = {We describe an investigation into e-mail content mining for 
                   author identification, or authorship attribution, for the purpose 
                   of forensic investigation. We focus our discussion on the ability 
                   to discriminate between authors for the case of both aggregated 
                   e-mail topics as well as across different email topics. An 
                   extended set of e-mail document features including structural 
                   characteristics and linguistic patterns were derived and, 
                   together with a Support Vector Machine learning algorithm, were 
                   used for mining the e-mail content. Experiments using a number of 
                   e-mail documents generated by different authors on a set of 
                   topics gave promising results for both aggregated and multi-topic 
                   author categorisation.},
}
@inProceedings{Diaz98,
   author       = {D{\'{\i}}az Esteban, Alberto and De Buenaga Rodr{\'{\i}}guez, 
                   Manuel and Ure{\~n}a L{\'o}pez, L. Alfonso and Garc{\'{\i}}a 
                   Vega, Manuel},
   title        = {Integrating Linguistic Resources in an Uniform Way for Text 
                   Classification Tasks},
   booktitle    = {Proceedings of LREC-98, 1st International Conference on Language 
                   Resources and Evaluation},
   publisher    = {},
   editor       = {Antonio Rubio and Natividad Gallardo and Rosa Castro and Antonio 
                   Tejada},
   address      = {Granada, {ES}},
   pages        = {1197--1204},
   year         = {1998},
   url          = {http://www.esi.uem.es/laboratorios/sinai/postscripts/lrec98.ps},
   abstract     = {Applications based on automatic text classification tasks, like 
                   text categorization (TC), word sense disambiguation (WSD), text 
                   filtering or routing, monolingual or multilingual information 
                   retrieval, and text summarization could obtain serious 
                   improvements by integrating linguistic resources in the current 
                   methods. We present an approach using the Vector Space Model to 
                    integrate two different kinds of resources: a lexical database and 
                   training collections, in text content analysis tasks. The 
                   training approaches we test are the Rocchio (relevance feedback) 
                   and the Widrow-Hoff (machine learning) algorithms and WordNet as 
                    the lexical database. We have developed experimental systems for 
                   TC and WSD. Results obtained from evaluation show that the 
                   integration of WordNet can outperform approaches based only on 
                   training.},
}
@article{Diederich03,
   author       = {Diederich, Joachim and Kindermann, J{\"{o}}rg and Leopold, Edda 
                   and Paa{{\ss}}, Gerhard},
   title        = {Authorship Attribution with Support Vector Machines},
   journal      = {Applied Intelligence},
   year         = {2003},
   volume       = {19},
   number       = {1/2},
   pages        = {109--123},
   url          = {http://ipsapp007.kluweronline.com/content/getfile/4504/36/6/abstract.htm},
   abstract     = {In this paper we explore the use of text-mining methods for the 
                   identification of the author of a text. We apply the support 
                   vector machine (SVM) to this problem, as it is able to cope with 
                    half a million inputs: it requires no feature selection and can 
                   process the frequency vector of all words of a text. We performed 
                   a number of experiments with texts from a German newspaper. With 
                   nearly perfect reliability the SVM was able to reject other 
                    authors and detected the target author in 60--80\% of the cases. 
                   In a second experiment, we ignored nouns, verbs and adjectives 
                   and replaced them by grammatical tags and bigrams. This resulted 
                   in slightly reduced performance. Author detection with SVMs on 
                   full word forms was remarkably robust even if the author wrote 
                   about different topics.},
}
@inProceedings{Dorre99,
   author       = {Jochen D{\"o}rre and Peter Gerstl and Roland Seiffert},
   title        = {Text mining: finding nuggets in mountains of textual data},
   booktitle    = {Proceedings of KDD-99, 5th ACM International Conference on 
                   Knowledge Discovery and Data Mining},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {},
   year         = {1999},
   address      = {San Diego, {US}},
   pages        = {398--401},
   url          = {http://www.acm.org/pubs/articles/proceedings/ai/312129/p398-dorre/p398-dorre.pdf},
   abstract     = {Text mining applies the same analytical functions of data mining 
                   to the domain of textual information, relying on sophisticated 
                   text analysis techniques that distill information from free-text 
                    documents. IBM's Intelligent Miner for Text provides the 
                    necessary tools to unlock the business information that is 
                    ``trapped'' in email, insurance claims, news feeds, or other 
                    document repositories. It has been successfully applied in 
                    analyzing patent portfolios, customer complaint letters, and even 
                    competitors' Web pages. After defining our notion of ``text 
                   mining'', we focus on the differences between text and data 
                   mining and describe in some more detail the unique technologies 
                   that are key to successful text mining.},
}
@inProceedings{Dagan96,
   author       = {Dagan, Ido and Feldman, Ronen and Hirsh, Haym},
   title        = {Keyword-based browsing and analysis of large document sets},
   booktitle    = {Proceedings of SDAIR-96, 5th Annual Symposium on Document 
                   Analysis and Information Retrieval},
   publisher    = {},
   editor       = {},
   year         = {1996},
   address      = {Las Vegas, {US}},
   pages        = {191--207},
   url          = {},
   abstract     = {Knowledge discovery in databases (KDD) focuses on the 
                   computerized exploration of large amounts of data and on the 
                   discovery of interesting patterns within them. While most work on 
                   KDD has been concerned with structured databases, there has been 
                   little work on handling the huge amount of information that is 
                   available only in unstructured textual form. The paper describes 
                   the KDT system for knowledge discovery in texts. It is built on 
                   top of a text-categorization paradigm where text articles are 
                   annotated with keywords organized in a hierarchical structure. 
                   Knowledge discovery is performed by analyzing the co-occurrence 
                   frequencies of keywords from this hierarchy in the various 
                   documents. The authors show how this term-frequency approach 
                   supports a range of KDD operations, providing a general framework 
                   for knowledge discovery and exploration in collections of 
                   unstructured text.},
}
@inProceedings{Dagan97,
   author       = {Ido Dagan and Yael Karov and Dan Roth},
   title        = {Mistake-driven learning in text categorization},
   booktitle    = {Proceedings of EMNLP-97, 2nd Conference on Empirical Methods in 
                   Natural Language Processing},
   publisher    = {Association for Computational Linguistics, Morristown, {US}},
   editor       = {Claire Cardie and Ralph Weischedel},
   year         = {1997},
   address      = {Providence, {US}},
   pages        = {55--63},
   url          = {http://l2r.cs.uiuc.edu/~danr/Papers/categ.ps.gz},
   abstract     = {Learning problems in the text processing domain often map the 
                   text to a space whose dimensions are the measured features of the 
                   text, e.g., its words. Three characteristic properties of this 
                   domain are (a) very high dimensionality, (b) both the learned 
                   concepts and the instances reside very sparsely in the feature 
                   space, and (c) a high variation in the number of active features 
                   in an instance. In this work we study three mistake-driven 
                   learning algorithms for a typical task of this nature - text 
                   categorization. We argue that these algorithms which categorize 
                   documents by learning a linear separator in the feature space 
                   have a few properties that make them ideal for this domain. We 
                   then show that a quantum leap in performance is achieved when we 
                   further modify the algorithms to better address some of the 
                   specific characteristics of the domain. In particular, we 
                   demonstrate (1) how variation in document length can be tolerated 
                   by either normalizing feature weights or by using negative 
                   weights, (2) the positive effect of applying a threshold range in 
                   training, (3) alternatives in considering feature frequency, and 
                   (4) the benefits of discarding features while training. Overall, 
                   we present an algorithm, a variation of Littlestone's Winnow, 
                   which performs significantly better than any other algorithm 
                   tested on this task using a similar feature set.},
}
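%
% For reference, the basic (unmodified) Winnow algorithm that the entry
% above takes as its starting point looks as follows; the paper's variants
% add weight normalization, negative weights, and threshold ranges on top
% of this. A minimal sketch, assuming binary features given as index lists:
%
%   def winnow(examples, n_features, alpha=2.0):
%       # examples: (active_feature_indices, label) pairs, label in {0, 1}
%       w, theta = [1.0] * n_features, float(n_features)
%       for active, label in examples:
%           pred = 1 if sum(w[i] for i in active) >= theta else 0
%           if pred != label:                  # mistake-driven update
%               factor = alpha if label == 1 else 1.0 / alpha
%               for i in active:
%                   w[i] *= factor             # promote or demote
%       return w, theta
%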
@article{Damashek95,
   author       = {Marc Damashek},
   title        = {Gauging Similarity with {N}-Grams: Language-Independent 
                   Categorization of Text},
   journal      = {Science},
   year         = {1995},
   volume       = {267},
   number       = {5199},
   pages        = {843--848},
   url          = {},
   abstract     = {A language-independent means of gauging topical similarity in 
                   unrestricted text is described. The method combines information 
                   derived from n-grams (consecutive sequences of n characters) with 
                   a simple vector-space technique that makes sorting, 
                   categorization, and retrieval feasible in a large multilingual 
                   collection of documents. No prior information about document 
                   content or language is required. Context, as it applies to 
                   document similarity, can be accommodated by a well-defined 
                   procedure. When an existing document is used as an exemplar, the 
                   completeness and accuracy with which topically related documents 
                   are retrieved is comparable to that of the best existing systems. 
                   The results of a formal evaluation are discussed, and examples 
                   are given using documents in English and Japanese.},
}
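%
% The n-gram technique in the entry above needs no tokenization at all; a
% minimal sketch (the paper additionally subtracts a collection centroid to
% discount n-grams common to the whole collection, a step omitted here):
%
%   import math
%   from collections import Counter
%
%   def profile(text, n=5):
%       # relative frequencies of character n-grams
%       grams = Counter(text[i:i + n] for i in range(len(text) - n + 1))
%       total = float(sum(grams.values()))
%       return {g: c / total for g, c in grams.items()}
%
%   def similarity(p, q):
%       # cosine of the angle between two n-gram frequency vectors
%       dot = sum(w * q.get(g, 0.0) for g, w in p.items())
%       norm = lambda v: math.sqrt(sum(x * x for x in v.values()))
%       return dot / (norm(p) * norm(q)) if p and q else 0.0
%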
@inProceedings{Denoyer01,
   author       = {Ludovic Denoyer and Hugo Zaragoza and Patrick Gallinari},
   title        = {{HMM}-based Passage Models for Document Classification and 
                   Ranking},
   booktitle    = {Proceedings of ECIR-01, 23rd European Colloquium on Information 
                   Retrieval Research},
   editor       = {},
   year         = {2001},
   address      = {Darmstadt, {DE}},
   publisher    = {},
   pages        = {126--135},
   url          = {http://www-connex.lip6.fr/~denoyer/publications/denoyer-final-ecir01.ps},
   abstract     = {We present an application of Hidden Markov Models to supervised 
                   document classification and ranking. We consider a family of 
                   models that take into account the fact that relevant documents 
                   may contain irrelevant passages; the originality of the model is 
                   that it does not explicitly segment documents but rather 
                   considers all possible segmentations in its final score. This 
                   model generalizes the multinomial Naive Bayes and it is derived 
                   from a more general model for different access tasks. The model 
                   is evaluated on the REUTERS test collection and compared to the 
                   multinomial Naive Bayes model. It is shown to be more robust with 
                   respect to the training set size and to improve the performance 
                    both for ranking and classification, especially for classes with 
                   few training examples.},
}
@inProceedings{Diao00,
   author       = {Yanlei Diao and Hongjun Lu and Dekai Wu},
   title        = {A comparative study of classification-based personal e-mail 
                   filtering},
   booktitle    = {Proceedings of PAKDD-00, 4th Pacific-Asia Conference on Knowledge 
                   Discovery and Data Mining},
   editor       = {Takao Terano and Huan Liu and Arbee L.P. Chen},
   pages        = {408--419},
   year         = {2000},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Kyoto, {JP}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1805},
   url          = {http://www.cs.berkeley.edu/~diaoyl/publications/pakdd00.ps},
   abstract     = {This paper addresses personal E-mail filtering by casting it in 
                   the framework of text classification. Modeled as semi-structured 
                   documents, E-mail messages consist of a set of fields with 
                   predefined semantics and a number of variable length free-text 
                   fields. While most work on classification either concentrates on 
                   structured data or free text, the work in this paper deals with 
                   both of them. To perform classification, a naive Bayesian 
                   classifier was designed and implemented, and a decision tree 
                   based classifier was implemented. The design considerations and 
                   implementation issues are discussed. Using a relatively large 
                   amount of real personal E-mail data, a comprehensive comparative 
                   study was conducted using the two classifiers. The importance of 
                   different features is reported. Results of other issues related 
                   to building an effective personal E-mail classifier are presented 
                   and discussed. It is shown that both classifiers can perform 
                   filtering with reasonable accuracy. While the decision tree based 
                   classifier outperforms the Bayesian classifier when features and 
                   training size are selected optimally for both, a carefully 
                   designed naive Bayesian classifier is more robust.},
}
@article{Doyle65,
   author       = {Lauren B. Doyle},
   title        = {Is automatic classification a reasonable application of 
                   statistical analysis of text?},
   journal      = {Journal of the {ACM}},
   volume       = {12},
   number       = {4},
   year         = {1965},
   pages        = {473--489},
   url          = {http://www.acm.org/pubs/articles/journals/jacm/1965-12-4/p473-doyle/p473-doyle.pdf},
   abstract     = {The statistical approach to the analysis of document collections 
                   and retrieval therefrom has proceeded along two main lines, 
                   associative machine searching and automatic classification. The 
                   former approach has been favored because of the tendency of 
                   people in the computer field to strive for new methods of dealing 
                   with the literature -- methods which do not resemble those of 
                   traditional libraries. But automatic classification study also 
                   has been thriving; some of the reasons for this are discussed. 
                   The crucial question of the quality of automatic classification 
                   is treated at considerable length, and empirical data are 
                   introduced to support the hypothesis that classification quality 
                   improves as more information about each document is used for 
                   input to the classification program. Six nonjudgmental criteria 
                   are used in testing the hypothesis for 100 keyword lists (each 
                   list representing a document) for a series of computer runs in 
                   which the number of words per document is increased progressively 
                   from 12 to 36. Four of the six criteria indicate the hypothesis 
                   holds, and two point to no effect. Previous work of this kind has 
                   been confined to the range of one through eight words per 
                   document. Finally, the future of automatic classification and 
                   some of the practical problems to be faced are outlined.},
}
@article{Drucker99,
   author       = {Harris Drucker and Vladimir Vapnik and Donghui Wu},
   title        = {Support vector machines for spam categorization},
   journal      = {{IEEE} Transactions on Neural Networks},
   year         = {1999},
   number       = {5},
   volume       = {10},
   pages        = {1048--1054},
   url          = {http://www.monmouth.edu/~drucker/SVM_spam_article_compete.PDF},
   abstract     = {We study the use of Support Vector Machines (SVMs) in classifying 
                   email as spam or nonspam by comparing it to three other 
                   classification algorithms: Ripper, Rocchio, and boosting decision 
                   trees. These four algorithms were tested on two different data 
                   sets: one data set where the number of features were constrained 
                   to the 1000 best features and another data set where the 
                   dimensionality was over 7000. SVMs performed best when using 
                   binary features. For both data sets, boosting trees and SVMs had 
                   acceptable test performance in terms of accuracy and speed. 
                   However, SVMs had significantly less training time.},
}
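%
% A minimal illustrative sketch (not the authors' implementation) of the kind
% of setup Drucker99 describes -- binary term-presence features fed to a
% linear SVM -- assuming scikit-learn is available; the tiny corpus and the
% labels below are hypothetical placeholders.
%
%   from sklearn.feature_extraction.text import CountVectorizer
%   from sklearn.svm import LinearSVC
%
%   docs   = ["cheap meds online now", "minutes of the project meeting"]  # hypothetical
%   labels = [1, 0]                                                       # 1 = spam
%
%   vec = CountVectorizer(binary=True)        # 0/1 term-presence features
%   clf = LinearSVC().fit(vec.fit_transform(docs), labels)
%   print(clf.predict(vec.transform(["free meds now"])))
%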
@inProceedings{Dumais98,
   author       = {Susan T. Dumais and John Platt and David Heckerman and Mehran 
                   Sahami},
   title        = {Inductive learning algorithms and representations for text 
                   categorization},
   booktitle    = {Proceedings of CIKM-98, 7th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {Georges Gardarin and James C. French and Niki Pissinou and Kia 
                   Makki and Luc Bouganim},
   year         = {1998},
   address      = {Bethesda, {US}},
   pages        = {148--155},
   url          = {http://robotics.stanford.edu/users/sahami/papers-dir/cikm98.pdf},
   abstract     = {Text categorization -- the assignment of natural language texts to 
                    one or more predefined categories based on their content -- is an 
                   important component in many information organization and 
                   management tasks. We compare the effectiveness of five different 
                   automatic learning algorithms for text categorization in terms of 
                   learning speed, real-time classification speed, and 
                   classification accuracy. We also examine training set size, and 
                   alternative document representations. Very accurate text 
                   classifiers can be learned automatically from training examples. 
                   Linear Support Vector Machines (SVMs) are particularly promising 
                   because they are very accurate, quick to train, and quick to 
                   evaluate.},
}
@inProceedings{Dumais00,
   author       = {Susan T. Dumais and Hao Chen},
   title        = {Hierarchical classification of {W}eb content},
   booktitle    = {Proceedings of SIGIR-00, 23rd ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Athens, {GR}},
   year         = {2000},
   pages        = {256--263},
   url          = {http://research.microsoft.com/~sdumais/sigir00.pdf},
   abstract     = {This paper explores the use of hierarchical structure for 
                   classifying a large, heterogeneous collection of web content. The 
                   hierarchical structure is initially used to train different 
                   second-level classifiers. In the hierarchical case, a model is 
                   learned to distinguish a second-level category from other 
                   categories within the same top level. In the flat 
                   non-hierarchical case, a model distinguishes a second-level 
                   category from all other second-level categories. Scoring rules 
                   can further take advantage of the hierarchy by considering only 
                   second-level categories that exceed a threshold at the top level. 
                   We use support vector machine (SVM) classifiers, which have been 
                   shown to be efficient and effective for classification, but not 
                   previously explored in the context of hierarchical 
                   classification. We found small advantages in accuracy for 
                   hierarchical models over flat models. For the hierarchical 
                   approach, we found the same accuracy using a sequential Boolean 
                   decision rule and a multiplicative decision rule. Since the 
                   sequential approach is much more efficient, requiring only 
                   14\%-16\% of the comparisons used in the other approaches, we 
                   find it to be a good choice for classifying text into large 
                   hierarchical structures.},
}
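%
% The two decision rules compared in Dumais00 are easy to write down. A
% minimal sketch, assuming per-node scores P(category|document) are already
% available from independently trained binary classifiers (e.g. SVMs); the
% function names, hierarchy encoding and thresholds are illustrative, not
% taken from the paper.
%
%   def sequential_boolean(top_scores, child_scores, hierarchy, t1=0.5, t2=0.5):
%       """Assign a second-level category only if its parent exceeds the
%       top-level threshold AND the category exceeds its own threshold;
%       subtrees whose parent fails t1 are never scored at the second level."""
%       assigned = []
%       for parent, children in hierarchy.items():
%           if top_scores[parent] >= t1:
%               assigned += [c for c in children if child_scores[c] >= t2]
%       return assigned
%
%   def multiplicative(top_scores, child_scores, hierarchy, t=0.25):
%       """Assign a second-level category if P(parent) * P(child) exceeds t."""
%       return [c for parent, children in hierarchy.items()
%               for c in children if top_scores[parent] * child_scores[c] >= t]
%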
@inProceedings{ElYaniv01,
   author       = {Ran El-Yaniv and Oren Souroujon},
   title        = {Iterative Double Clustering for Unsupervised and Semi-supervised 
                   Learning},
   booktitle    = {Proceedings of ECML-01, 12th European Conference on Machine 
                   Learning},
   editor       = {Luc De Raedt and Peter A. Flach},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Freiburg, {DE}},
   year         = {2001},
   pages        = {121--132},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2167},
   url          = {http://link.springer.de/link/service/series/0558/papers/2167/21670121.pdf},
   abstract     = {This paper studies the Iterative Double Clustering (IDC) 
                   meta-clustering algorithm, a new extension of the recent Double 
                   Clustering (DC) method of Slonim and Tishby that exhibited 
                   impressive performance on text categorization tasks. Using 
                   synthetically generated data we empirically demonstrate that 
                   whenever the DC procedure is successful in recovering some of the 
                   structure hidden in the data, the extended IDC procedure can 
                   incrementally compute a dramatically better classification, with 
                   minor additional computational resources. We demonstrate that the 
                   IDC algorithm is especially advantageous when the data exhibits 
                   high attribute noise. Our simulation results also show the 
                   effectiveness of IDC in text categorization problems. 
                   Surprisingly, this unsupervised procedure can be competitive with 
                   a (supervised) SVM trained with a small training set. Finally, we 
                   propose a natural extension of IDC for (semi-supervised) 
                   transductive learning where we are given both labeled and 
                   unlabeled examples, and present preliminary empirical results 
                   showing the plausibility of the extended method in a 
                   semi-supervised setting.},
}
@inProceedings{Escudero00,
   author       = {Gerard Escudero and Llu{\'{\i}}s M{\`{a}}rquez and German Rigau},
   title        = {Boosting applied to word sense disambiguation},
   booktitle    = {Proceedings of ECML-00, 11th European Conference on Machine 
                   Learning},
   editor       = {Ramon L{\'{o}}pez De M{\'{a}}ntaras and Enric Plaza},
   address      = {Barcelona, {ES}},
   pages        = {129--141},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1810},
   year         = {2000},
   url          = {http://www.lsi.upc.es/~escudero/recerca/ecml00.pdf},
   abstract     = {In this paper Schapire and Singer's AdaBoost.MH boosting 
                   algorithm is applied to the Word Sense Disambiguation (WSD) 
                   problem. Initial experiments on a set of 15 selected polysemous 
                   words show that the boosting approach surpasses Naive Bayes and 
                   Exemplar-based approaches, which represent state-of-the-art 
                   accuracy on supervised WSD. In order to make boosting practical 
                   for a real learning domain of thousands of words, several ways of 
                   accelerating the algorithm by reducing the feature space are 
                   studied. The best variant, which we call LazyBoosting, is tested 
                   on the largest sense-tagged corpus available containing 192,800 
                   examples of the 191 most frequent and ambiguous English words. 
                   Again, boosting compares favourably to the other benchmark 
                   algorithms.},
}
@article{Fall03,
   author       = {C. J. Fall and A. T{\"o}rcsv{\'a}ri and K. Benzineb and G. 
                   Karetka},
   title        = {Automated Categorization in the International Patent 
                   Classification},
   journal      = {{SIGIR} Forum},
   year         = {2003},
   pages        = {},
   volume       = {37},
   number       = {1},
   url          = {http://www.acm.org/sigir/forum/S2003/CJF_Manuscript_sigir.pdf},
   abstract     = {A new reference collection of patent documents for training and 
                   testing automated categorization systems is established and 
                   described in detail. This collection is tailored for automating 
                   the attribution of international patent classification codes to 
                   patent applications and is made publicly available for future 
                   research work. We report the results of applying a variety of 
                   machine learning algorithms to the automated categorization of 
                   English-language patent documents. This procedure involves a 
                   complex hierarchical taxonomy, within which we classify documents 
                   into 114 classes and 451 subclasses. Several measures of 
                   categorization success are described and evaluated. We 
                   investigate how best to resolve the training problems related to 
                   the attribution of multiple classification codes to each patent 
                   document.},
}
@inProceedings{Fangmeyer68,
   author       = {Hermann Fangmeyer and Gerhard Lustig},
   title        = {The {EURATOM} automatic indexing project},
   booktitle    = {Proceedings of the IFIP Congress (Booklet J)},
   publisher    = {},
   editor       = {},
   year         = {1968},
   address      = {Edinburgh, {UK}},
   pages        = {66--70},
   url          = {},
   abstract     = {},
}
@inProceedings{Fangmeyer70,
   author       = {Hermann Fangmeyer and Gerhard Lustig},
   title        = {Experiments with the {CETIS} automated indexing system},
   booktitle    = {Proceedings of the Symposium on the Handling of Nuclear 
                   Information},
   publisher    = {International Atomic Energy Agency},
   editor       = {},
   year         = {1970},
   address      = {},
   pages        = {557--567},
   url          = {},
   abstract     = {},
}
@inProceedings{Ferilli01,
   author       = {Stefano Ferilli and Nicola Fanizzi and Gianni Semeraro},
   title        = {Learning logic models for automated text categorization},
   booktitle    = {Proceedings of AI*IA-01, 7th Congress of the Italian Association 
                   for Artificial Intelligence},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2175},
   editor       = {Floriana Esposito},
   year         = {2001},
   pages        = {81--86},
   address      = {Bari, {IT}},
   url          = {http://link.springer.de/link/service/series/0558/papers/2175/21750081.pdf},
   abstract     = {This work addresses a logical approach to text categorization 
                   inside a framework aimed at full automatic paper document 
                   processing. The logic representation of sentences required by the 
                   adopted learning algorithm is obtained by detecting structure in 
                    raw text through a parser. A preliminary experimentation proved 
                   that the logic approach is able to capture the semantics 
                   underlying some kind of sentences, even if the assessment of the 
                   efficiency of such a method, as well as a comparison with other 
                   related approaches, has still to be carried out.},
}
@article{Field75,
   author       = {B.J. Field},
   title        = {Towards automatic indexing: automatic assignment of 
                   controlled-language indexing and classification from free 
                   indexing},
   year         = {1975},
   journal      = {Journal of Documentation},
   volume       = {31},
   number       = {4},
   pages        = {246--265},
   url          = {},
   abstract     = {},
}
@inProceedings{Finn02,
   author       = {Aidan Finn and Nicholas Kushmerick and Barry Smyth},
   title        = {Genre Classification and Domain Transfer for Information 
                   Filtering},
   booktitle    = {Proceedings of ECIR-02, 24th European Colloquium on Information 
                   Retrieval Research},
   editor       = {Fabio Crestani and Mark Girolami and Cornelis J. Van Rijsbergen},
   year         = {2002},
   address      = {Glasgow, {UK}},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2291},
   pages        = {353--362},
   url          = {http://www.cs.ucd.ie/staff/nick/home/research/download/finn-ecir2002.ps.gz},
   abstract     = {The World Wide Web is a vast repository of information, but the 
                   sheer volume makes it difficult to identify useful documents. We 
                    identify document genre as an important factor in retrieving 
                   useful documents and focus on the novel document genre dimension 
                   of subjectivity. We investigate three approaches to automatically 
                   classifying documents by genre: traditional bag of words 
                   techniques, part-of-speech statistics, and hand-crafted shallow 
                   linguistic features. We are particularly interested in domain 
                   transfer: how well the learned classifiers generalize from the 
                   training corpus to a new document corpus. Our experiments 
                   demonstrate that the part-of-speech approach is better than 
                   traditional bag of words techniques, particularly in the domain 
                   transfer conditions.},
}
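%
% A minimal sketch of the part-of-speech-statistics representation that
% Finn02 contrasts with bag-of-words, assuming NLTK with its tokenizer and
% tagger data installed; this only illustrates the idea and is not the
% authors' feature set.
%
%   import nltk
%   from collections import Counter
%
%   def pos_profile(text):
%       """Relative frequency of each part-of-speech tag in a document --
%       a genre-oriented and largely topic-independent representation."""
%       tags = [tag for _, tag in nltk.pos_tag(nltk.word_tokenize(text))]
%       total = len(tags) or 1
%       return {tag: count / total for tag, count in Counter(tags).items()}
%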
@inProceedings{Fisher03,
   author       = {Michelle Fisher and Richard Everson},
   title        = {When are links useful? Experiments in text classification},
   booktitle    = {Proceedings of ECIR-03, 25th European Conference on Information 
                   Retrieval},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   editor       = {Fabrizio Sebastiani},
   address      = {Pisa, {IT}},
   year         = {2003},
   pages        = {41--56},
   url          = {http://link.springer.de/link/service/series/0558/papers/2633/26330041.pdf},
   abstract     = {Link analysis methods have become popular for information access 
                   tasks, especially information retrieval, where the link 
                   information in a document collection is used to complement the 
                   traditionally used content information. However, there has been 
                   little firm evidence to confirm the utility of link information. 
                   We show that link information can be useful when the document 
                   collection has a sufficiently high link density and links are of 
                   sufficiently high quality. We report experiments on text 
                   classification of the Cora and WebKB data sets using 
                   Probabilistic Latent Semantic Analysis and Probabilistic 
                   Hypertext Induced Topic Selection. Comparison with manually 
                   assigned classes shows that link information enhances 
                   classification in data with sufficiently high link density, but 
                   is detrimental to performance at low link densities or if the 
                   quality of the links is degraded. We introduce a new 
                   frequency-based method for selecting the most useful citations 
                   from a document collection for use in the model.},
}
@inCollection{Forsyth99,
   author       = {Richard S. Forsyth},
   title        = {New directions in text categorization},
   editor       = {Alex Gammerman},
   booktitle    = {Causal models and intelligent data management},
   publisher    = {Springer Verlag},
   address      = {Heidelberg, {DE}},
   year         = {1999},
   pages        = {151--185},
   url          = {},
   abstract     = {},
}
@inProceedings{Frank00,
   author       = {Eibe Frank and Chang Chui and Ian H. Witten},
   title        = {Text Categorization Using Compression Models},
   booktitle    = {Proceedings of DCC-00, IEEE Data Compression Conference},
   editor       = {James A. Storer and Martin Cohn},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   year         = {2000},
   address      = {Snowbird, {US}},
   pages        = {200--209},
   url          = {http://dlib.computer.org/conferen/dcc/0592/pdf/05920555.pdf},
   abstract     = {Text categorization is the assignment of natural language texts 
                   to predefined categories based on their content. It has often 
                   been observed that compression seems to provide a very promising 
                   approach to categorization. The overall compression of an article 
                   with respect to different models can be compared to see which one 
                   it fits most closely. Such a scheme has several potential 
                   advantages because it does not require any pre-processing of the 
                   input text. We have performed extensive experiments on the use of 
                   PPM compression models for categorization using the standard 
                   Reuters-21578 dataset. We obtained some encouraging results on 
                   two-category situations, and the results on the general problem 
                   seem reasonably impressive---in one case outstanding. However, we 
                   find that PPM does not compete with the published state of the 
                   art in the use of machine learning for text categorization. It 
                   produces inferior results because it is insensitive to subtle 
                   differences between articles that belong to a category and those 
                   that do not. We do not believe our results are specific to PPM. 
                   If the occurrence of a single word determines whether an article 
                   belongs to a category or not (and it often does) any compression 
                   scheme will likely fail to classify the article correctly. 
                   Machine learning schemes fare better because they automatically 
                   eliminate irrelevant features and concentrate on the most 
                   discriminating ones.},
}
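%
% The compression-based scheme discussed in Frank00 can be sketched in a few
% lines: keep one body of training text per category and assign a new
% document to the category whose text lets it be encoded most cheaply. The
% sketch below uses bz2 as a crude stand-in for the PPM models actually used
% in the paper; the corpora dictionary is a hypothetical placeholder.
%
%   import bz2
%
%   def extra_bytes(category_text, doc):
%       """Additional compressed bytes needed to encode doc after the
%       category's training text."""
%       return (len(bz2.compress((category_text + " " + doc).encode()))
%               - len(bz2.compress(category_text.encode())))
%
%   def classify(doc, corpora):            # corpora: {category: training text}
%       return min(corpora, key=lambda c: extra_bytes(corpora[c], doc))
%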
@inProceedings{Frasconi01,
   author       = {Paolo Frasconi and Giovanni Soda and Alessandro Vullo},
   title        = {Text Categorization for Multi-page Documents: A Hybrid Naive 
                   {Bayes HMM} Approach},
   booktitle    = {Proceedings of JCDL, 1st ACM-IEEE Joint Conference on Digital 
                   Libraries},
   editor       = {},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   year         = {2001},
   address      = {Roanoke, {US}},
   pages        = {11--20},
   url          = {http://www.dsi.unifi.it/~paolo/ps/jcdl01-hmm-text.pdf},
   abstract     = {Text categorization is typically formulated as a concept learning 
                   problem where each instance is a single isolated document. In 
                   this paper we are interested in a more general formulation where 
                   documents are organized as page sequences, as naturally occurring 
                   in digital libraries of scanned books and magazines. We describe 
                   a method for classifying pages of sequential OCR text documents 
                   into one of several assigned categories and suggest that taking 
                   into account contextual information provided by the whole page 
                   sequence can significantly improve classification accuracy. The 
                   proposed architecture relies on hidden Markov models whose 
                   emissions are bag-of-words according to a multinomial word event 
                   model, as in the generative portion of the Naive Bayes 
                   classifier. Our results on a collection of scanned journals from 
                   the Making of America project confirm the importance of using 
                   whole page sequences. Empirical evaluation indicates that the 
                   error rate (as obtained by running a plain Naive Bayes classifier 
                    on isolated pages) can be roughly reduced by half if contextual 
                   information is incorporated.},
}
@article{Frasconi02,
   author       = {Paolo Frasconi and Giovanni Soda and Alessandro Vullo},
   title        = {Text Categorization for Multi-page Documents: A Hybrid Naive 
                   {Bayes HMM} Approach},
   journal      = {Journal of Intelligent Information Systems},
   year         = {2002},
   note         = {Special Issue on Automated Text Categorization},
   volume       = {18},
   number       = {2/3},
   pages        = {195--217},
   url          = {http://www.wkap.nl/article.pdf?391247},
   abstract     = {In the traditional setting, text categorization is formulated as 
                   a concept learning problem where each instance is a single 
                   isolated document. However, this perspective is not appropriate 
                   in the case of many digital libraries that offer as contents 
                   scanned and optically read books or magazines. In this paper, we 
                   propose a more general formulation of text categorization, 
                   allowing documents to be organized as \textit{sequences} of 
                   pages. We introduce a novel hybrid system specifically designed 
                   for multi-page text documents. The architecture relies on hidden 
                   Markov models whose emissions are bag-of-words resulting from a 
                   multinomial word event model, as in the generative portion of the 
                   Naive Bayes classifier. The rationale behind our proposal is that 
                   taking into account contextual information provided by the whole 
                   page sequence can help disambiguation and improves single page 
                   classification accuracy. Our results on two datasets of scanned 
                   journals from the Making of America collection confirm the 
                   importance of using whole page sequences. The empirical 
                   evaluation indicates that the error rate (as obtained by running 
                   the Naive Bayes classifier on isolated pages) can be 
                   significantly reduced if contextual information is incorporated.},
}
@inProceedings{Frommholz01,
   author       = {Ingo Frommholz},
   title        = {Categorizing {W}eb Documents in Hierarchical Catalogues},
   booktitle    = {Proceedings of ECIR-01, 23rd European Colloquium on Information 
                   Retrieval Research},
   editor       = {},
   year         = {2001},
   address      = {Darmstadt, {DE}},
   publisher    = {},
   pages        = {},
   url          = {http://ls6-www.informatik.uni-dortmund.de/bib/fulltext/ir/Frommholz:01a.pdf},
   abstract     = {Automatic categorization of web documents (e.g. HTML documents) 
                   denotes the task of automatically finding relevant categories for 
                   a (new) document which is to be inserted into a web catalogue 
                   like Yahoo!. There exist many approaches for performing this 
                   difficult task. Here, special kinds of web catalogues, those 
                   whose category scheme is hierarchically ordered, are regarded. A 
                   method for using the knowledge about the hierarchy to gain better 
                   categorization results is discussed. This method can be applied 
                   in a post-processing step and therefore be combined with other 
                   known (non-hierarchical) categorization approaches.},
}
@inProceedings{Fuhr84,
   author       = {Norbert Fuhr and Gerhard Knorz},
   title        = {Retrieval test evaluation of a rule-based automated indexing 
                   {(AIR/PHYS)}},
   booktitle    = {Proceedings of SIGIR-84, 7th ACM International Conference on 
                   Research and Development in Information Retrieval},
   year         = {1984},
   publisher    = {Cambridge University Press},
   editor       = {Cornelis J. Van Rijsbergen},
   pages        = {391--408},
   address      = {Cambridge, {UK}},
   url          = {},
   abstract     = {},
}
@inProceedings{Fuhr85,
   author       = {Norbert Fuhr},
   title        = {A probabilistic model of dictionary-based automatic indexing},
   booktitle    = {Proceedings of RIAO-85, 1st International Conference ``Recherche 
                   d'Information Assistee par Ordinateur''},
   publisher    = {},
   editor       = {},
   address      = {Grenoble, {FR}},
   year         = {1985},
   pages        = {207--216},
   url          = {},
   abstract     = {},
}
@inProceedings{Fuhr91a,
   author       = {Norbert Fuhr and Stephan Hartmann and Gerhard Knorz and 
                    Gerhard Lustig and Michael Schwantner and Konstadinos Tzeras},
   title        = {{AIR/X} -- a Rule-Based Multistage Indexing System for Large 
                   Subject Fields},
   booktitle    = {Proceedings of RIAO-91, 3rd International Conference ``Recherche 
                   d'Information Assistee par Ordinateur''},
   publisher    = {Elsevier Science Publishers, Amsterdam, {NL}},
   editor       = {Andr{\'e} Lichnerowicz},
   address      = {Barcelona, {ES}},
   year         = {1991},
   pages        = {606--623},
   url          = {http://www.darmstadt.gmd.de/~tzeras/FullPapers/gz/Fuhr-etal-91.ps.gz},
   abstract     = {AIR/X is a rule-based system for indexing with terms 
                   (descriptors) from a prescribed vocabulary. For this task, an 
                   indexing dictionary with rules for mapping terms from the text 
                   onto descriptors is required, which can be derived automatically 
                   from a set of manually indexed documents. Based on the Darmstadt 
                   Indexing Approach, the indexing task is divided into a 
                   description step and a decision step. First, terms (single words 
                   or phrases) are identified in the document text. With 
                   term-descriptor rules from the dictionary, descriptor indications 
                   are formed. The set of all indications from a document leading to 
                   the same descriptor is called a relevance description. A 
                   probabilistic classification procedure computes indexing weights 
                   for each relevance description. Since the whole system is 
                   rule-based, it can be adapted to different subject fields by 
                   appropriate modifications of the rule bases. A major application 
                   of AIR/X is the AIR/PHYS system developed for a large physics 
                   database. This application is described in more detail along with 
                   experimental results.},
}
@inProceedings{Fuhr91b,
   author       = {Norbert Fuhr and Ulrich Pfeifer},
   title        = {Combining Model-Oriented and Description-Oriented Approaches for 
                   Probabilistic Indexing},
   booktitle    = {Proceedings of SIGIR-91, 14th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Abraham Bookstein and Yves Chiaramella and Gerard Salton and 
                   Vijay V. Raghavan},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Chicago, {US}},
   pages        = {46--56},
   year         = {1991},
   note         = {An extended version appears as~\cite{Fuhr94}},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/122860/p46-fuhr/p46-fuhr.pdf},
   abstract     = {We distinguish model-oriented and description-oriented approaches 
                   in probabilistic information retrieval. The former refer to 
                   certain representations of documents and queries and use 
                   additional independence assumptions, whereas the latter map 
                   documents and queries onto feature vectors which form the input 
                   to certain classification procedures or regression methods. 
                   Description-oriented approaches are more flexible with respect to 
                   the underlying representations, but the definition of the feature 
                   vector is a heuristic step. In this paper, we combine a 
                   probabilistic model for the Darmstadt Indexing Approach with 
                   logistic regression. Here the probabilistic model forms a 
                   guideline for the definition of the feature vector. Experiments 
                   with the purely theoretical approach and with several heuristic 
                   variations show that heuristic assumptions may yield significant 
                   improvements.},
}
@article{Fuhr94,
   author       = {Norbert Fuhr and Ulrich Pfeifer},
   title        = {Probabilistic Information Retrieval as a Combination of 
                    Abstraction, Inductive Learning and Probabilistic Assumptions},
   journal      = {{ACM} Transactions on Information Systems},
   year         = {1994},
   number       = {1},
   volume       = {12},
   pages        = {92--115},
   url          = {http://ls6-www.informatik.uni-dortmund.de/bib/fulltext/ir/Fuhr_Pfeifer:94.ps.gz},
   abstract     = {We show that former approaches in probabilistic information 
                   retrieval are based on one or two of the three concepts 
                   abstraction, inductive learning and probabilistic assumptions, 
                   and we propose a new approach which combines all three concepts. 
                   This approach is illustrated for the case of indexing with a 
                   controlled vocabulary. For this purpose, we describe a new 
                    probabilistic model first, which is then combined with logistic 
                   regression, thus yielding a generalization of the original model. 
                   Experimental results for the pure theoretical model as well as 
                   for heuristic variants are given. Furthermore, linear and 
                   logistic regression are compared.},
}
@inProceedings{Furnkranz99,
   author       = {Johannes F{\"{u}}rnkranz},
   title        = {Exploiting Structural Information for Text Classification on the 
                   {WWW}},
   booktitle    = {Proceedings of IDA-99, 3rd Symposium on Intelligent Data Analysis},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1642},
   editor       = {David J. Hand and Joost N. Kok and Michael R. Berthold},
   address      = {Amsterdam, {NL}},
   year         = {1999},
   pages        = {487--497},
   url          = {http://www.ai.univie.ac.at/~juffi/publications/ida-99.ps.gz},
   abstract     = {In this paper, we report on a set of experiments that explore the 
                   utility of making use of the structural information of WWW 
                   documents. Our working hypothesis is that it is often easier to 
                   classify a hypertext page using information provided on pages 
                   that point to it instead of using information that is provided on 
                   the page itself. We present experimental evidence that confirms 
                   this hypothesis on a set of Web pages that relate to computer 
                   science departments.},
}
@article{Furnkranz02,
   author       = {Johannes F{\"{u}}rnkranz},
   title        = {Hyperlink Ensembles: A Case Study in Hypertext Classification},
   journal      = {Information Fusion},
   year         = {2002},
   number       = {4},
   volume       = {3},
   pages        = {299--312},
   url          = {},
   abstract     = {In this paper, we introduce hyperlink ensembles, a novel type of 
                   ensemble classifier for classifying hypertext documents. Instead 
                   of using the text on a page for deriving features that can be 
                   used for training a classifier, we suggest to use portions of 
                   texts from all pages that point to the target page. A hyperlink 
                   ensemble is formed by obtaining one prediction for each hyperlink 
                   that points to a page. These individual predictions for each 
                   hyperlink are subsequently combined to a final prediction for the 
                   class of the target page. We explore four different ways of 
                   combining the individual predictions and four different 
                   techniques for identifying relevant text portions. The utility of 
                   our approach is demonstrated on a set of Web-pages that relate to 
                   Computer Science Departments.},
}
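%
% A minimal sketch of the hyperlink-ensemble idea of Furnkranz02: one
% prediction per in-link (here derived from the text surrounding the link on
% the citing page), combined into a single class for the target page.
% Majority voting is only one of the four combination schemes the paper
% explores, and `base_classifier` is an assumed, already-trained text
% classifier.
%
%   from collections import Counter
%
%   def classify_target_page(inlink_texts, base_classifier):
%       votes = [base_classifier(text) for text in inlink_texts]
%       return Counter(votes).most_common(1)[0][0]     # majority vote
%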
@inProceedings{Galavotti00,
   author       = {Luigi Galavotti and Fabrizio Sebastiani and Maria Simi},
   title        = {Experiments on the use of feature selection and negative evidence 
                   in automated text categorization},
   booktitle    = {Proceedings of ECDL-00, 4th European Conference on Research and 
                   Advanced Technology for Digital Libraries},
   editor       = {Jos{\'e} L. Borbinha and Thomas Baker},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1923},
   year         = {2000},
   address      = {Lisbon, {PT}},
   pages        = {59--68},
   url          = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/ECDL00.pdf},
   abstract     = {We tackle two different problems of {\em text categorization} 
                   (TC), namely feature selection and classifier induction. {\em 
                   Feature selection} (FS) refers to the activity of selecting, from 
                   the set of $r$ distinct features (i.e.\ words) occurring in the 
                   collection, the subset of $r'\ll r$ features that are most useful 
                   for compactly representing the meaning of the documents. We 
                   propose a novel FS technique, based on a simplified variant of 
                   the $\chi^2$ statistics. {\em Classifier induction} refers 
                   instead to the problem of automatically building a text 
                   classifier by learning from a set of documents pre-classified 
                   under the categories of interest. We propose a novel variant, 
                   based on the exploitation of negative evidence, of the well-known 
                   $k$-NN method. We report the results of systematic 
                   experimentation of these two methods performed on the standard 
                   {\sc Reuters-21578} benchmark.},
}
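%
% Feature selection of the kind studied in Galavotti00 -- keeping the r' << r
% term features that score highest against the categories -- can be sketched
% with scikit-learn's standard chi-square scorer; note that the paper's own
% contribution is a simplified chi-square variant, which is not what
% scikit-learn implements, and the 20 Newsgroups corpus below is only a
% stand-in for the Reuters-21578 collection used in the paper.
%
%   from sklearn.datasets import fetch_20newsgroups
%   from sklearn.feature_extraction.text import CountVectorizer
%   from sklearn.feature_selection import SelectKBest, chi2
%
%   train = fetch_20newsgroups(subset="train")
%   X = CountVectorizer().fit_transform(train.data)                   # r features
%   X_sel = SelectKBest(chi2, k=1000).fit_transform(X, train.target)  # r' features
%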
@article{Gale93,
   author       = {William A. Gale and Kenneth W. Church and David Yarowsky},
   title        = {A method for disambiguating word senses in a large corpus},
   journal      = {Computers and the Humanities},
   year         = {1993},
   number       = {5},
   volume       = {26},
   pages        = {415--439},
   url          = {http://www.research.att.com/~kwc/published_1993_sense.ps},
   abstract     = {},
}
@inProceedings{Gao03,
   author       = {Sheng Gao and Wen Wu and Chin-Hui Lee and Tat-Seng Chua},
   title        = {A maximal figure-of-merit learning approach to text 
                   categorization},
   booktitle    = {Proceedings of SIGIR-03, 26th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Jamie Callan and Gordon Cormack and Charles Clarke and David 
                   Hawking and Alan Smeaton},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Toronto, {CA}},
   year         = {2003},
   pages        = {174--181},
   url          = {http://doi.acm.org/10.1145/860435.860469},
   abstract     = {A novel maximal figure-of-merit (MFoM) learning approach to text 
                   categorization is proposed. Different from the conventional 
                   techniques, the proposed MFoM method attempts to integrate any 
                   performance metric of interest (e.g. accuracy, recall, precision, 
                   or F1 measure) into the design of any classifier. The 
                   corresponding classifier parameters are learned by optimizing an 
                   overall objective function of interest. To solve this highly 
                   nonlinear optimization problem, we use a generalized 
                   probabilistic descent algorithm. The MFoM learning framework is 
                   evaluated on the Reuters-21578 task with LSI-based feature 
                   extraction and a binary tree classifier. Experimental results 
                   indicate that the MFoM classifier gives improved F1 and enhanced 
                   robustness over the conventional one. It also outperforms the 
                   popular SVM method in micro-averaging F1. Other extensions to 
                   design discriminative multiple-category MFoM classifiers for 
                   application scenarios with new performance metrics could be 
                   envisioned too.},
}
@inProceedings{Gaussier02,
   author       = {{\'{E}}ric Gaussier and Cyril Goutte and Kris Popat and Francine 
                   Chen},
   title        = {A hierarchical model for clustering and categorising documents},
   booktitle    = {Proceedings of ECIR-02, 24th European Colloquium on Information 
                   Retrieval Research},
   editor       = {Fabio Crestani and Mark Girolami and Cornelis J. Van Rijsbergen},
   year         = {2002},
   address      = {Glasgow, {UK}},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2291},
   pages        = {229--247},
   url          = {http://link.springer.de/link/service/series/0558/papers/2291/22910229.pdf},
   abstract     = {We propose a new hierarchical generative model for textual data, 
                   where words may be generated by topic specific distributions at 
                   any level in the hierarchy. This model is naturally well-suited 
                   to clustering documents in preset or automatically generated 
                   hierarchies, as well as categorising new documents in an existing 
                   hierarchy. Training algorithms are derived for both cases, and 
                   illustrated on real data by clustering news stories and 
                   categorising newsgroup messages. Finally, the generative model 
                   may be used to derive a Fisher kernel expressing similarity 
                   between documents.},
}
@article{Gentili01,
   author       = {G.L. Gentili and Mauro Marinilli and Alessandro Micarelli and 
                   Filippo Sciarrone},
   title        = {Text categorization in an intelligent agent for filtering 
                   information on the {W}eb},
   journal      = {International Journal of Pattern Recognition and Artificial 
                   Intelligence},
   pages        = {527--549},
   year         = {2001},
   number       = {3},
   volume       = {15},
   url          = {http://www.worldscinet.com/journals/ijprai/15/preserved-docs/1503/S021800140100099X.pdf},
   abstract     = {This paper presents a text categorization system, capable of 
                   analyzing HTML/text documents collected from the Web. The system 
                   is a component of a more extensive intelligent agent for adaptive 
                   information filtering on the Web. It is based on a hybrid 
                   case-based architecture, where two multilayer perceptrons are 
                   integrated into a case-based reasoner. An empirical evaluation of 
                   the system was performed by means of a confidence interval 
                   technique. The experimental results obtained are encouraging and 
                   support the choice of a hybrid case-based approach to text 
                   categorization.},
}
@inProceedings{Geutner93,
   author       = {Petra Geutner and Uli Bodenhausen and Alex Waibel},
   title        = {Flexibility Through Incremental Learning: Neural Networks for 
                   Text Categorization},
   booktitle    = {Proceedings of WCNN-93, World Congress on Neural Networks},
   publisher    = {},
   editor       = {},
   year         = {1993},
   address      = {Portland, {US}},
   pages        = {24--27},
   url          = {http://werner.ira.uka.de/papers/speech/1993/WCNN_93_petra_geutner.ps.gz},
   abstract     = {In this paper we show an adaptive incremental learning algorithm 
                   that learns interactively to classify text messages (here: 
                   emails) into categories without the need for lengthy batch 
                   training runs. The algorithm was evaluated on a large database of 
                   email messages that fall into five subjective categories. As 
                    a control experiment, best human categorization performance was 
                   established at 79.4\% for this task. The best of all 
                   connectionist architectures presented here achieves near human 
                   performance (79.1\%). This architecture acquires its language 
                   model and dictionary adaptively and hence avoids handcoding of 
                   either. The learning algorithm combines an adaptive phase which 
                   instantly updates dictionary and weights during interaction and a 
                   tuning phase which fine tunes for performance using previously 
                   seen data. Such systems can be deployed in various applications 
                   where instantaneous interactive learning is necessary such as 
                   on-line email or news categorization, text summarization and 
                   information filtering in general.},
}
@inProceedings{Ghani00,
   author       = {Rayid Ghani},
   title        = {Using error-correcting codes for text classification},
   booktitle    = {Proceedings of ICML-00, 17th International Conference on Machine 
                   Learning},
   editor       = {Pat Langley},
   year         = {2000},
   address      = {Stanford, {US}},
   pages        = {303--310},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www.cs.cmu.edu/~rayid/mypapers/ecoc-icml.ps},
   abstract     = {This paper explores in detail the use of Error Correcting Output 
                   Coding (ECOC) for learning text classifiers. We show that the 
                   accuracy of a Naive Bayes Classifier over text classification 
                   tasks can be significantly improved by taking advantage of the 
                   error-correcting properties of the code. We also explore the use 
                   of different kinds of codes, namely Error-Correcting Codes, 
                   Random Codes, and Domain and Data-specific codes and give 
                   experimental results for each of them. The ECOC method scales 
                   well to large data sets with a large number of classes. 
                   Experiments on a real-world data set show a reduction in 
                   classification error by up to 66\% over the traditional Naive 
                   Bayes Classifier. We also compare our empirical results to 
                   semi-theoretical results and find that the two closely agree.},
}
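%
% A minimal sketch of ECOC-based text classification as studied in Ghani00,
% assuming scikit-learn: each class is mapped to a (here random) binary
% codeword, one binary Naive Bayes classifier is trained per codeword bit,
% and a new document is assigned to the class whose codeword is closest to
% the vector of predicted bits. The three-document corpus is a hypothetical
% placeholder.
%
%   from sklearn.feature_extraction.text import CountVectorizer
%   from sklearn.multiclass import OutputCodeClassifier
%   from sklearn.naive_bayes import MultinomialNB
%
%   docs   = ["wheat prices rose", "the central bank cut rates",
%             "corn harvest was poor"]
%   labels = [0, 1, 0]                        # hypothetical category ids
%
%   vec = CountVectorizer()
%   X = vec.fit_transform(docs)
%   ecoc = OutputCodeClassifier(MultinomialNB(), code_size=4, random_state=0)
%   ecoc.fit(X, labels)
%   print(ecoc.predict(vec.transform(["bank rates"])))
%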
@inProceedings{Ghani01,
   author       = {Rayid Ghani and Se{\'{a}}n Slattery and Yiming Yang},
   title        = {Hypertext Categorization using Hyperlink Patterns and Meta Data},
   booktitle    = {Proceedings of ICML-01, 18th International Conference on Machine 
                   Learning},
   editor       = {Carla Brodley and Andrea Danyluk},
   address      = {Williams College, {US}},
   year         = {2001},
   pages        = {178--185},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www.cs.cmu.edu/~yiming/papers.yy/hypertext-icml01.ps.gz},
   abstract     = {Hypertext poses new text classification research challenges as 
                   hyperlinks, content of linked documents, and meta data about 
                   related web sites all provide richer sources of information for 
                   hypertext classification that are not available in traditional 
                   text classification. We investigate the use of such information 
                   for representing web sites, and the effectiveness of different 
                   classifiers (Naive Bayes, Nearest Neighbor, and {\sc Foil}) in 
                   exploiting those representations. We find that using words in web 
                   pages alone often yields suboptimal performance of classifiers, 
                   compared to exploiting additional sources of information beyond 
                   document content. On the other hand, we also observe that linked 
                   pages can be more harmful than helpful when the linked 
                   neighborhoods are highly ``noisy'' and that links have to be used 
                   in a careful manner. More importantly, our investigation suggests 
                   that meta data which is often available, or can be acquired using 
                   Information Extraction techniques, can be extremely useful for 
                   improving classification accuracy. Finally, the relative 
                   performance of the different classifiers being tested gives us 
                   insights into the strengths and limitations of our algorithms for 
                   hypertext classification.},
}
@inProceedings{Ghani01a,
   author       = {Rayid Ghani},
   title        = {Combining Labeled and Unlabeled data for Text Classification with 
                   a Large Number of Categories},
   booktitle    = {Proceedings of the IEEE International Conference on Data Mining},
   editor       = {Nick Cercone and Tsau Young Lin and Xindong Wu},
   address      = {San Jose, {US}},
   year         = {2001},
   pages        = {597--598},
   publisher    = {{IEEE} Computer Society, Los Alamitos, {US}},
   url          = {http://www.cs.cmu.edu/~rayid/mypapers/icdm01.ps},
   abstract     = {We develop a framework to incorporate unlabeled data in the 
                   Error-Correcting Output Coding (ECOC) setup by de-composing 
                   multiclass problems into multiple binary prob-lems and then use 
                   Co-Training to learn the individual bi-nary classification 
                   problems. We show that our method is especially useful for 
                   classification tasks involving a large number of categories where 
                   Co-training doesn¹t perform very well by itself and when combined 
                   with ECOC, outper-forms several other algorithms that combine 
                   labeled and unlabeled data for text classification in terms of 
                   accuracy, precision-recall tradeoff, and efficiency.},
}
@inProceedings{Ghani02,
   author       = {Rayid Ghani},
   title        = {Combining Labeled and Unlabeled Data for MultiClass Text 
                   Categorization},
   booktitle    = {Proceedings of ICML-02, 19th International Conference on Machine 
                   Learning},
   editor       = {},
   year         = {2002},
   address      = {Sydney, {AU}},
   pages        = {},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www.accenture.com/xdoc/en/services/technology/publications/Ghani-ICML02.pdf},
   abstract     = {Supervised learning techniques for text classification often 
                   require a large number of labeled examples to learn accurately. 
                   One way to reduce the amount of labeled data required is to 
                   develop algorithms that can learn effectively from a small number 
                   of labeled examples augmented with a large number of unlabeled 
                   examples. Current text learning techniques for combining labeled 
                   and unlabeled, such as EM and Co-Training, are mostly applicable 
                   for classification tasks with a small number of classes and do 
                   not scale up well for large multiclass problems. In this paper, 
                   we develop a framework to incorporate unlabeled data in the 
                   Error-Correcting Output Coding (ECOC) setup by first decomposing 
                   multiclass problems into multiple binary problems and then using 
                   Co-Training to learn the individual binary classification 
                   problems. We show that our method is especially useful for text 
                   classification tasks involving a large number of categories and 
                   outperforms other semi-supervised learning techniques such as EM 
                   and Co-Training. In addition to being highly accurate, this 
                   method utilizes the hamming distance from ECOC to provide 
                   high-precision results. We also present results with algorithms 
                   other than co-training in this framework and show that 
                   co-training is uniquely suited to work well within ECOC.},
}
@inProceedings{Giorgetti03,
   author       = {Daniela Giorgetti and Fabrizio Sebastiani},
   title        = {Multiclass Text Categorization for Automated Survey Coding},
   year         = {2003},
   address      = {Melbourne, {US}},
   booktitle    = {Proceedings of SAC-03, 18th ACM Symposium on Applied Computing},
   publisher    = {{ACM} Press, New York, {US}},
   pages        = {798--802},
   url          = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/SAC03a.pdf},
   abstract     = {\emph{Survey coding} is the task of assigning a symbolic code 
                   from a predefined set of such codes to the answer given in 
                   response to an open-ended question in a questionnaire (aka 
                   \emph{survey}). We formulate the problem of automated survey 
                   coding as a \emph{text categorization} problem, i.e.\ as the 
                   problem of learning, by means of supervised machine learning 
                   techniques, a model of the association between answers and codes 
                   from a training set of pre-coded answers, and applying the 
                   resulting model to the classification of new answers. In this 
                   paper we experiment with two different learning techniques, one 
                   based on na\"{\i}ve Bayesian classification and the other one 
                   based on multiclass support vector machines, and test the 
                   resulting framework on a corpus of social surveys. The results we 
                   have obtained significantly outperform the results achieved by 
                   previous automated survey coding approaches.},
}
@article{Giorgetti03a,
   author       = {Daniela Giorgetti and Fabrizio Sebastiani},
   title        = {Automating Survey Coding by Multiclass Text Categorization 
                   Techniques},
   journal      = {Journal of the American Society for Information Science and 
                   Technology},
   year         = {2003},
   volume       = {},
   number       = {},
   pages        = {},
   url          = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/JASIST03.pdf},
   abstract     = {\emph{Survey coding} is the task of assigning a symbolic code 
                   from a predefined set of such codes to the answer given in 
                   response to an open-ended question in a questionnaire (aka 
                   \emph{survey}). This task is usually carried out in order to 
                   group respondents according to a predefined scheme based on their 
                   answers. Survey coding has several applications, especially in 
                   the social sciences, ranging from the simple classification of 
                   respondents to the extraction of statistics on political 
                   opinions, health and lifestyle habits, customer satisfaction, 
                   brand fidelity, and patient satisfaction. Survey coding is a 
                   difficult task, since the code that should be attributed to a 
                   respondent based on the answer she has given is a matter of 
                   subjective judgment, and thus requires expertise. It is thus 
                   unsurprising that this task has traditionally been performed 
                   manually, by trained coders. Some attempts have been made at 
                   automating this task, most of them based on detecting the 
                   similarity between the answer and textual descriptions of the 
                   meanings of the candidate codes. We take a radically new stand, 
                   and formulate the problem of automated survey coding as a 
                   \emph{text categorization} problem, i.e.\ as the problem of 
                   learning, by means of supervised machine learning techniques, a 
                   model of the association between answers and codes from a 
                   training set of pre-coded answers, and applying the resulting 
                   model to the classification of new answers. In this paper we 
                   experiment with two different learning techniques, one based on 
                   na\"{\i}ve Bayesian classification and the other one based on 
                   multiclass support vector machines, and test the resulting 
                   framework on a corpus of social surveys. The results we have 
                   obtained significantly outperform the results achieved by 
                   previous automated survey coding approaches.},
   note         = {Forthcoming},
}
@inProceedings{Glover02,
   author       = {Eric J. Glover and Kostas Tsioutsiouliklis and Steve Lawrence and 
                   David M. Pennock and Gary W. Flake},
   title        = {Using {W}eb structure for classifying and describing {W}eb pages},
   booktitle    = {Proceedings of WWW-02, International Conference on the World Wide 
                   Web},
   address      = {Honolulu, {US}},
   year         = {2002},
   pages        = {562--569},
   publisher    = {{ACM} Press, New York, {US}},
   url          = {http://www.cs.princeton.edu/~kt/www02.ps},
   abstract     = {The structure of the web is increasingly being used to improve 
                   organization, search, and analysis of information on the web. For 
                   example, Google uses the text in citing documents (documents that 
                   link to the target document) for search. We analyze the relative 
                   utility of document text, and the text in citing documents near 
                   the citation, for classification and description. Results show 
                   that the text in citing documents, when available, often has 
                   greater discriminative and descriptive power than the text in the 
                   target document itself. The combination of evidence from a 
                   document and citing documents can improve on either information 
                   source alone. Moreover, by ranking words and phrases in the 
                   citing documents according to expected entropy loss, we are able 
                   to accurately name clusters of web pages, even with very few 
                   positive examples. Our results confirm, quantify, and extend 
                   previous research using web structure in these areas, introducing 
                   new methods for classification and description of pages.},
}
@inProceedings{Goldberg95,
   author       = {Goldberg, Jeffrey L.},
   title        = {{CDM}: an approach to learning in text categorization},
   booktitle    = {Proceedings of ICTAI-95, 7th International Conference on Tools 
                   with Artificial Intelligence},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   editor       = {},
   address      = {Herndon, {US}},
   year         = {1995},
   pages        = {258--265},
   url          = {},
   note         = {An extended version appears as~\cite{Goldberg96}},
   abstract     = {The category discrimination method (CDM) is a new learning 
                   algorithm designed for text categorization. The motivation is 
                   that there are statistical problems associated with natural 
                   language text when it is applied as input to existing machine 
                   learning algorithms (too much noise, too many features, skewed 
                   distribution). The bases of the CDM are research results about 
                   the way that humans learn categories and concepts vis-a-vis 
                   contrasting concepts. The essential formula is cue validity 
                   borrowed from cognitive psychology, and used to select from all 
                    possible single word-based features the `best' predictors of a 
                    given category. The hypothesis that CDM's performance exceeds two 
                    non-domain-specific algorithms, Bayesian classification and 
                   decision tree learners, is empirically tested.},
}
@article{Goldberg96,
   author       = {Goldberg, Jeffrey L.},
   title        = {{CDM}: an approach to learning in text categorization},
   journal      = {International Journal on Artificial Intelligence Tools},
   year         = {1996},
   number       = {1/2},
   volume       = {5},
   pages        = {229--253},
   url          = {},
   abstract     = {The Category Discrimination Method (CDM) is a new machine 
                   learning algorithm designed specifically for text categorization. 
                   The motivation is that there are statistical problems associated 
                   with natural language text when it is applied as input to 
                   existing machine learning algorithms (too much noise, too many 
                   features, skewed distribution). The bases of the CDM are research 
                   results about the way that humans learn categories and concepts 
                   vis-a-vis contrasting concepts. The essential formula is cue 
                   validity borrowed from cognitive psychology, and used to select 
                    from all possible single word-based features the best predictors 
                    of a given category. The hypothesis that CDM's performance will 
                    exceed two non-domain-specific algorithms, Bayesian 
                   classification and decision tree learners, is empirically tested.},
}
@inProceedings{Goodman90,
   author       = {Marc Goodman},
   title        = {{\sc Prism}: a case-based telex classifier},
   booktitle    = {Proceedings of IAAI-90, 2nd Conference on Innovative Applications 
                   of Artificial Intelligence},
   publisher    = {{AAAI} Press, Menlo Park, {US}},
   editor       = {Alain Rappaport and Reid Smith},
   year         = {1990},
   address      = {},
   pages        = {25--37},
   url          = {},
   abstract     = {},
}
@inProceedings{Goevert99,
   author       = {Norbert G{\"{o}}vert and Mounia Lalmas and Norbert Fuhr},
   title        = {A probabilistic description-oriented approach for categorising 
                   {W}eb documents},
   booktitle    = {Proceedings of CIKM-99, 8th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {},
   year         = {1999},
   address      = {Kansas City, {US}},
   pages        = {475--482},
   url          = {http://ls6-www.informatik.uni-dortmund.de/ir/publications/1999/Goevert_etal:99.html},
   abstract     = {The automatic categorisation of web documents is becoming crucial 
                    for organising the huge amount of information available on the 
                   Internet. We are facing a new challenge due to the fact that web 
                   documents have a rich structure and are highly heterogeneous. Two 
                   ways to respond to this challenge are (1) using a representation 
                   of the content of web documents that captures these two 
                   characteristics and (2) using more effective classifiers. Our 
                   categorisation approach is based on a probabilistic 
                   description-oriented representation of web documents, and a 
                   probabilistic interpretation of the k-nearest neighbour 
                   classifier. With the former, we provide an enhanced document 
                   representation that incorporates the structural and heterogeneous 
                   nature of web documents. With the latter, we provide a 
                    theoretically sound justification for the various parameters of the 
                   k-nearest neighbour classifier. Experimental results show that 
                   (1) using an enhanced representation of web documents is crucial 
                   for an effective categorisation of web documents, and (2) a 
                   theoretical interpretation of the k-nearest neighbour classifier 
                   gives us improvement over the standard k-nearest neighbour 
                   classifier.},
}
@inProceedings{Gomez02,
   author       = {G{\'o}mez-Hidalgo, Jos{\'e} M. and De Buenaga Rodr{\'{\i}}guez, 
                    Jos{\'e} M. and Ure{\~n}a L{\'o}pez, Luis A. and Mart{\'{\i}}n 
                   Valdivia, Maria T. and Garc{\'{\i}}a Vega, Manuel},
   title        = {Integrating Lexical Knowledge in Learning-Based Text 
                   Categorization},
   booktitle    = {Proceedings of JADT-02, 6th International Conference on the 
                   Statistical Analysis of Textual Data},
   publisher    = {},
   editor       = {},
   address      = {St-Malo, {FR}},
   pages        = {},
   year         = {2002},
   url          = {http://www.cavi.univ-paris3.fr/lexicometrica/jadt/jadt2002/PDF-2002/gomez_debuenaga_urena_martin_garcia.pdf},
   abstract     = {Automatic Text Categorization (ATC) is an important task in the 
                   field of Information Access. The prevailing approach to ATC is 
                    making use of a collection of prelabeled texts for the 
                   induction of a document classifier through learning methods. With 
                   the increasing availability of lexical resources in electronic 
                   form (including Lexical Databases (LDBs), Machine Readable 
                    Dictionaries, etc.), there is an interesting opportunity for 
                    integrating them into learning-based ATC. In this paper, we 
                   present an approach to the integration of lexical knowledge 
                   extracted from the LDB WordNet in learning-based ATC, based on 
                   Stacked Generalization (SG). The method we suggest is based on 
                   combining the lexical knowledge extracted from the LDB 
                   interpreted as a classifier with a learning-based classifier, 
                    through SG. We have performed experiments whose results show that 
                   the ideas we describe are promising and deserve further 
                   investigation.},
}
@inProceedings{Gomez02a,
   author       = {G{\'o}mez-Hidalgo, Jos{\'e} M.},
   title        = {Evaluating Cost-Sensitive Unsolicited Bulk Email Categorization},
   booktitle    = {Proceedings of SAC-02, 17th ACM Symposium on Applied Computing},
   editor       = {},
   address      = {Madrid, {ES}},
   pages        = {615--620},
   year         = {2002},
   url          = {http://doi.acm.org/10.1145/508791.508911},
   abstract     = {In recent years, Unsolicited Bulk Email has become an 
                    increasingly important problem, with a significant economic 
                    impact. In this paper, we discuss cost-sensitive Text Categorization 
                    methods for UBE filtering. Concretely, we have evaluated a range of 
                   Machine Learning methods for the task (C4.5, Naive Bayes, PART, 
                   Support Vector Machines and Rocchio), made cost sensitive through 
                   several methods (Threshold Optimization, Instance Weighting, and 
                   Meta-Cost). We have used the Receiver Operating Characteristic 
                    Convex Hull method for the evaluation, which best suits 
                    classification problems in which target conditions are not 
                    known, as is the case here. Our results do not show a dominant 
                    algorithm, nor a dominant method for making algorithms 
                    cost-sensitive, but they are the best reported on the test 
                    collection used, and approach the accuracy of real-world 
                    hand-crafted classifiers.},
}
@article{Gray71,
   author       = {W. A. Gray and A. J. Harley},
   title        = {Computer-assisted indexing},
   journal      = {Information Storage and Retrieval},
   year         = {1971},
   volume       = {7},
   number       = {4},
   pages        = {167--174},
   url          = {},
   abstract     = {},
}
@inProceedings{Guthrie94,
   author       = {Louise Guthrie and Elbert Walker and Joe A. Guthrie},
   title        = {Document classification by machine: theory and practice},
   booktitle    = {Proceedings of COLING-94, 15th International Conference on 
                   Computational Linguistics},
   publisher    = {},
   editor       = {},
   address      = {Kyoto, {JP}},
   year         = {1994},
   pages        = {1059--1063},
   url          = {},
   abstract     = {},
}
@inCollection{Guthrie99,
   author       = {Louise Guthrie and Joe A. Guthrie and James Leistensnider},
   title        = {Document classification and routing},
   booktitle    = {Natural language information retrieval},
   editor       = {Tomek Strzalkowski},
   year         = {1999},
   pages        = {289--310},
   publisher    = {Kluwer Academic Publishers},
   address      = {Dordrecht, {NL}},
   url          = {},
   abstract     = {},
}
@inProceedings{Hadjarian01,
   author       = {Ali Hadjarian and Jerzy Bala and Peter Pachowicz},
   title        = {Text Categorization through Multistrategy Learning and 
                   Visualization},
   booktitle    = {Proceedings of CICLING-01, 2nd International Conference on 
                   Computational Linguistics and Intelligent Text Processing},
   year         = {2001},
   editor       = {Alexander Gelbukh},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Mexico City, {MX}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2004},
   pages        = {423--436},
   url          = {http://link.springer.de/link/service/series/0558/papers/2004/20040437.pdf},
   abstract     = {This paper introduces a multistrategy learning approach to the 
                   categorization of text documents. The approach benefits from two 
                    existing, and in our view complementary, sets of categorization 
                   techniques: those based on Rocchio's algorithm and those 
                   belonging to the rule learning class of machine learning 
                   algorithms. Visualization is used for the presentation of the 
                    output of learning.},
}
@inProceedings{Hamill78,
   author       = {Hamill, Karen A. and Zamora, Antonio},
   title        = {An automatic document classification system using pattern 
                   recognition techniques},
   booktitle    = {Proceedings of ASIS-78, 41st Annual Meeting of the American 
                   Society for Information Science},
   publisher    = {American Society for Information Science, Washington, {US}},
   editor       = {Everett H. Brenner},
   year         = {1978},
   address      = {New York, {US}},
   pages        = {152--155},
   url          = {},
   abstract     = {},
}
@article{Hamill80,
   author       = {Hamill, Karen A. and Zamora, Antonio},
   title        = {The Use of Titles for Automatic Document Classification},
   journal      = {Journal of the American Society for Information Science},
   year         = {1980},
   number       = {6},
   pages        = {396--402},
   volume       = {33},
   url          = {},
   abstract     = {},
}
@inProceedings{Han01,
   author       = {Eui-Hong Han and George Karypis and Vipin Kumar},
   title        = {Text Categorization Using Weight-Adjusted $k$-Nearest Neighbor 
                   Classification},
   booktitle    = {Proceedings of PAKDD-01, 5th Pacific-Asia Conference on 
                   Knowledge Discovery and Data Mining},
   editor       = {David Cheung and Qing Li and Graham Williams},
   year         = {2001},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Hong Kong, {CN}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2035},
   pages        = {53--65},
   url          = {http://link.springer.de/link/service/series/0558/papers/2035/20350053.pdf},
   abstract     = {Text categorization presents unique challenges due to the large 
                   number of attributes present in the data set, large number of 
                   training samples, attribute dependency, and multi-modality of 
                    categories. Existing classification techniques have limited 
                    applicability to data sets of this nature. In this paper, 
                   we present a Weight Adjusted k-Nearest Neighbor (WAKNN) 
                   classification that learns feature weights based on a greedy hill 
                   climbing technique. We also present two performance optimizations 
                   of WAKNN that improve the computational performance by a few 
                   orders of magnitude, but do not compromise on the classification 
                   quality. We experimentally evaluated WAKNN on 52 document data 
                   sets from a variety of domains and compared its performance 
                   against several classification algorithms, such as C4.5, RIPPER, 
                   Naive-Bayesian, PEBLS and VSM. Experimental results on these data 
                   sets confirm that WAKNN consistently outperforms other existing 
                   classification algorithms.},
}
@article{Hanauer96,
   author       = {David Hanauer},
   title        = {Integration of phonetic and graphic features in poetic text 
                   categorization judgements},
   journal      = {Poetics},
   year         = {1996},
   volume       = {23},
   number       = {5},
   pages        = {363--380},
   url          = {},
   abstract     = {The experiments reported in this paper deal with the relationship 
                   between specific formal textual features, i.e. graphic and 
                   phonetic information, and the reader's literary educational 
                   background in the categorization of poetic texts. In two 
                   experiments, the research method of Information Integration 
                   Theory was employed in order to test two hypotheses relating to 
                   the radical conventionalist and traditional positions on the role 
                   of specific formal textual features in the categorization of 
                   poetic texts. Twenty subjects from expert or novice literary 
                   reading experience backgrounds were, in two experiments, required 
                   to rate two parallel sets of graphically and phonetically 
                   manipulated poems. The results reveal that subjects are sensitive 
                   to the manipulations of graphic and phonetic information and use 
                   the same additive information integration rule in making poetic 
                   text categorization judgements. The expert literary readers were 
                   found to assign significantly higher ratings to all versions of 
                   the manipulated poems than the novice readers.},
}
@inProceedings{Hayes88,
   author       = {Philip J. Hayes and Laura E. Knecht and Monica J. Cellio},
   title        = {A news story categorization system},
   booktitle    = {Proceedings of ANLP-88, 2nd Conference on Applied Natural 
                   Language Processing},
   publisher    = {Association for Computational Linguistics, Morristown, {US}},
   address      = {Austin, {US}},
   editor       = {},
   year         = {1988},
   pages        = {9--17},
   url          = {},
   note         = {Reprinted in Karen Sparck Jones and Peter Willett (eds.), 
                   ``Readings in Information Retrieval'', Morgan Kaufmann, San 
                   Francisco, US, 1997, pp.\ 518--526.},
   abstract     = {The article describes a pilot version of a commercial application 
                   of natural language processing techniques to the problem of 
                    categorizing news stories into broad topic categories. The system 
                    does not perform a complete semantic or syntactic analysis of the 
                   input stories. Its categorizations are dependent on fragmentary 
                   recognition using pattern-matching techniques. The fragments it 
                   looks for are determined by a set of knowledge-based rules. The 
                   accuracy of the system is only slightly lower than that of human 
                   categorizers.},
}
@inProceedings{Hayes90a,
   author       = {Philip J. Hayes and Peggy M. Andersen and Irene B. Nirenburg and 
                   Linda M. Schmandt},
   title        = {{\sc Tcs}: a shell for content-based text categorization},
   booktitle    = {Proceedings of CAIA-90, 6th IEEE Conference on Artificial 
                   Intelligence Applications},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   editor       = {},
   year         = {1990},
   address      = {Santa Barbara, {US}},
   pages        = {320--326},
   url          = {},
   abstract     = {The kind of application that the text categorization shell, TCS, 
                   can produce is characterized. Many of its applications have great 
                   commercial value. The design goals for TCS are discussed, and 
                   other approaches to text categorization in the light of these 
                   goals are examined. The TCS and how it meets its design goals are 
                   described, and examples of applications built with TCS are given. 
                   A text-categorization application developed with TCS consists of 
                   the TCS run-time system and a rule base. The rule base defines 
                   what categories the application can assign to texts and contains 
                   rules that make the categorization decisions for particular 
                    texts. The data-driven nature of TCS allows it to satisfy 
                   fully the requirements of ease of application development, 
                   portability to other applications and maintainability.},
}
@inProceedings{Hayes90,
   author       = {Philip J. Hayes and Steven P. Weinstein},
   title        = {{\sc Construe/Tis}: a system for content-based indexing of a 
                   database of news stories},
   booktitle    = {Proceedings of IAAI-90, 2nd Conference on Innovative Applications 
                   of Artificial Intelligence},
   publisher    = {{AAAI} Press, Menlo Park, {US}},
   editor       = {Alain Rappaport and Reid Smith},
   year         = {1990},
   pages        = {49--66},
   url          = {},
   abstract     = {},
}
@article{He03,
   author       = {Ji He and Ah-Hwee Tan and Chew-Lim Tan},
   title        = {On Machine Learning Methods for {C}hinese Document Categorization},
   journal      = {Applied Intelligence},
   year         = {2003},
   volume       = {18},
   number       = {3},
   pages        = {311--322},
   url          = {http://www.kluweronline.com/issn/0924-669X},
   abstract     = {This paper reports our comparative evaluation of three machine 
                    learning methods, namely k Nearest Neighbor (kNN), Support Vector 
                   Machines (SVM), and Adaptive Resonance Associative Map (ARAM) for 
                   Chinese document categorization. Based on two Chinese corpora, a 
                   series of controlled experiments evaluated their learning 
                   capabilities and efficiency in mining text classification 
                    knowledge. Benchmark experiments showed that their predictive 
                    performance was roughly comparable, especially on clean and well 
                    organized data sets. While kNN and ARAM yielded better performance 
                   than SVM on small and clean data sets, SVM and ARAM significantly 
                   outperformed kNN on noisy data. Comparing efficiency, kNN was 
                   notably more costly in terms of time and memory than the other 
                   two methods. SVM is highly efficient in learning from well 
                   organized samples of moderate size, although on relatively large 
                    and noisy data the efficiency of SVM and ARAM is comparable.},
}
@article{Heaps73,
   author       = {H.S. Heaps},
   title        = {A theory of relevance for automatic document classification},
   year         = {1973},
   journal      = {Information and Control},
   volume       = {22},
   number       = {3},
   pages        = {268--278},
   url          = {},
   abstract     = {},
}
@inProceedings{Hearst91,
   author       = {Marti A. Hearst},
   title        = {Noun homograph disambiguation using local context in large 
                   corpora},
   booktitle    = {Proceedings of the 7th Annual Conference of the University of 
                   Waterloo Centre for the New Oxford English Dictionary},
   publisher    = {},
   editor       = {},
   year         = {1991},
   pages        = {1--22},
   address      = {Oxford, {UK}},
   url          = {ftp://parcftp.xerox.com/pub/hearst/oed91.ps.gz},
   abstract     = {This paper describes an accurate, relatively inexpensive method 
                   for the disambiguation of noun homographs using large text 
                   corpora. The algorithm checks the context surrounding the target 
                   noun against that of previously observed instances and chooses 
                   the sense for which the most evidence is found, where evidence 
                   consists of a set of orthographic, syntactic, and lexical 
                   features. Because the sense distinctions made are coarse, the 
                   disambiguation can be accomplished without the expense of 
                   knowledge bases or inference mechanisms. An implementation of the 
                   algorithm is described which, starting with a small set of 
                   hand-labeled instances, improves its results automatically via 
                   unsupervised training. The approach is compared to other attempts 
                   at homograph disambiguation using both machine readable 
                   dictionaries and unrestricted text and the use of training 
                   instances is determined to be a crucial difference.},
}
@proceedings{Hearst96a,
   editor       = {Marti A. Hearst and Haym Hirsh},
   title        = {Machine Learning in Information Access. Papers from the 1996 AAAI 
                   Spring Symposium},
   institution  = {American Association for Artificial Intelligence},
   address      = {Stanford, {US}},
   year         = {1996},
   note         = {Available as Technical Report SS-96-05},
   url          = {},
   abstract     = {},
}
@inProceedings{Hersh94,
   author       = {William Hersh and Christopher Buckley and T.J. Leone and David 
                    Hickam},
   title        = {{{\sc Ohsumed}}: an interactive retrieval evaluation and new 
                    large test collection for research},
   booktitle    = {Proceedings of SIGIR-94, 17th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {W. Bruce Croft and Cornelis J. Van Rijsbergen},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Dublin, {IE}},
   pages        = {192--201},
   year         = {1994},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/188490/p192-hersh/p192-hersh.pdf},
   abstract     = {A series of information retrieval experiments was carried out 
                   with a computer installed in a medical practice setting for 
                   relatively inexperienced physician end-users. Using a commercial 
                   MEDLINE product based on the vector space model, these physicians 
                   searched just as effectively as more experienced searchers using 
                   Boolean searching. The results of this experiment were 
                   subsequently used to create a new large medical test collection, 
                   which was used in experiments with the SMART retrieval system to 
                   obtain baseline performance data as well as compare SMART with 
                   the other searchers.},
}
@inProceedings{Hoashi00,
   author       = {Keiichiro Hoashi and Kazunori Matsumoto and Naomi Inoue and Kazuo 
                   Hashimoto},
   title        = {Document filtering methods using non-relevant information profile},
   booktitle    = {Proceedings of SIGIR-00, 23rd ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Athens, {GR}},
   year         = {2000},
   pages        = {176--183},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/345508/p176-hoashi/p176-hoashi.pdf},
   abstract     = {Document filtering is the task of retrieving documents relevant to a 
                   user's profile from a flow of documents. Generally, filtering 
                   systems calculate the similarity between the profile and each 
                   incoming document, and retrieve documents with similarity higher 
                   than a threshold. However, many systems set a relatively high 
                   threshold to reduce retrieval of non-relevant documents, which 
                    results in many relevant documents being ignored. In this 
                   paper, we propose the use of a non-relevant information profile 
                   to reduce the mistaken retrieval of non-relevant documents. 
                   Results from experiments show that this filter has successfully 
                   rejected a sufficient number of non-relevant documents, resulting 
                   in an improvement of filtering performance.},
}
@inProceedings{Hoch94,
   author       = {Rainer Hoch},
   title        = {Using {IR} techniques for text classification in document 
                   analysis},
   booktitle    = {Proceedings of SIGIR-94, 17th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {W. Bruce Croft and Cornelis J. Van Rijsbergen},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   year         = {1994},
   address      = {Dublin, {IE}},
   pages        = {31--40},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/188490/p31-hoch/p31-hoch.pdf},
   abstract     = {This paper presents the INFOCLAS system applying statistical 
                   methods of information retrieval for the classification of German 
                   business letters into corresponding message types such as order, 
                   offer, enclosure, etc. INFOCLAS is a first step towards the 
                   understanding of documents proceeding to a classification-driven 
                   extraction of information. The system is composed of two main 
                   modules: the central indexer (extraction and weighting of 
                   indexing terms) and the classifier (classification of business 
                   letters into given types). The system employs several knowledge 
                   sources including a letter database, word frequency statistics 
                   for German, lists of message type specific words, morphological 
                   knowledge as well as the underlying document structure. As 
                   output, the system evaluates a set of weighted hypotheses about 
                    the type of the actual letter. Classification of documents allows 
                   the automatic distribution or archiving of letters and is also an 
                   excellent starting point for higher-level document analysis.},
}
@article{Hoyle73,
   author       = {W.G. Hoyle},
   title        = {Automatic indexing and generation of classification by algorithm},
   journal      = {Information Storage and Retrieval},
   year         = {1973},
   volume       = {9},
   number       = {4},
   pages        = {233--242},
   url          = {},
   abstract     = {A system of automatic indexing based on Bayes' theorem is 
                   described briefly. In assigning 124 documents to 9 categories, 
                   there were 97 cases of agreement with professional indexers. 
                   Using a collection factor, based on 87 per cent human consistency 
                    from other sources, the computer appears then to index with 90 
                   per cent accuracy in this case. The technique is then used with 
                   two randomized sample document groups drawn from nine categories. 
                   Each group in turn is used as the basis for indexing the other. 
                   The computer knows only the number of categories. After 8 cycles 
                   the computer is found to have formed 9 groups consisting of about 
                   50 per cent of documents that were also lumped together by 
                   professional indexers on the basis of subject content. A new 
                   measure of performance is proposed and some other applications of 
                   the technique indicated.},
}
@inProceedings{Hsu99,
   author       = {Wen-Lin Hsu and Sheau-Dong Lang},
   title        = {Classification algorithms for {NETNEWS} articles},
   booktitle    = {Proceedings of CIKM-99, 8th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {},
   year         = {1999},
   address      = {Kansas City, {US}},
   pages        = {114--121},
   url          = {http://www.acm.org/pubs/articles/proceedings/cikm/319950/p114-hsu/p114-hsu.pdf},
   abstract     = {We propose several algorithms using the vector space model to 
                   classify the news articles posted on the NETNEWS according to the 
                   newsgroup categories. The baseline method combines the terms of 
                   all the articles of each newsgroup in the training set to 
                   represent the newsgroups as single vectors. After training, the 
                   incoming news articles are classified based on their similarity 
                   to the existing newsgroup categories. We propose to use the 
                   following techniques to improve the classification performance of 
                   the baseline method: (1) use routing (classification) accuracy 
                   and the similarity values to refine the training set; (2) update 
                   the underlying term structures periodically during testing; and 
                   (3) apply k-means clustering to partition the newsgroup articles 
                   and represent each newsgroup by k vectors. Our test collection 
                   consists of the real news articles and the 519 subnewsgroups 
                   under the REC newsgroup of NETNEWS in a period of 3 months. Our 
                    experimental results demonstrate that the technique of refining 
                    the training set reduces storage by one-third to two-thirds. The 
                    technique of periodic updates improves the routing 
                   accuracy ranging from 20\% to 100\% but incurs runtime overhead. 
                   Finally, representing each newsgroup by k vectors (with k = 2 or 
                   3) using clustering yields the most significant improvement in 
                    routing accuracy, ranging from 60\% to 100\%, while causing only 
                   slightly higher storage requirements.},
}
@inProceedings{Hsu99a,
   author       = {Wen-Lin Hsu and Sheau-Dong Lang},
   title        = {Feature Reduction and Database Maintenance in {NETNEWS} 
                   Classification},
   booktitle    = {Proceedings of IDEAS-99, 1999 International Database Engineering 
                   and Applications Symposium},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   editor       = {},
   year         = {1999},
   address      = {Montreal, {CA}},
   pages        = {137--144},
   url          = {http://dlib.computer.org/conferen/ideas/0265/pdf/02650137.pdf},
   abstract     = {We propose a statistical feature-reduction technique to filter 
                   out the most ambiguous articles in the training data for 
                   categorizing the NETNEWS articles. We also incorporate a batch 
                   updating scheme to periodically do maintenance on the term 
                   structures of the news database after training. The baseline 
                   method combines the terms of all the articles of each newsgroup 
                   in the training set to represent the newsgroups as single 
                   vectors. After training, the incoming news articles are 
                   classified based on their similarity to the existing newsgroup 
                   categories. Our implementation uses an inverted file to store the 
                   trained term structures of each newsgroup, and uses a list 
                    similar to the inverted file to buffer the newly arrived 
                   articles, for efficient routing and updating purposes. Our 
                   experimental results using real NETNEWS articles and newsgroups 
                   demonstrate (1) applying feature reduction to the training set 
                   improves the routing accuracy, efficiency, and database storage; 
                   (2) updating improves the routing accuracy; and (3) the batch 
                   technique improves the efficiency of the updating operation.},
}
@inProceedings{Huffman94,
   author       = {Stephen Huffman and Marc Damashek},
   title        = {Acquaintance: A Novel Vector-Space N-Gram Technique for Document 
                   Categorization},
   booktitle    = {Proceedings of TREC-3, 3rd Text Retrieval Conference},
   publisher    = {National Institute of Standards and Technology, Gaithersburg, {US}},
   editor       = {Donna K. Harman},
   year         = {1994},
   address      = {Gaithersburg, {US}},
   pages        = {305--310},
   url          = {},
   abstract     = {Acquaintance is the name of a novel vector-space n-gram technique 
                   for categorizing documents. The technique is completely 
                   language-independent, highly garble-resistant, and 
                   computationally simple. An unoptimized version of the algorithm 
                   was used to process the TREC database in a very short time.},
}
@inProceedings{Huffman95,
   author       = {Stephen Huffman},
   title        = {Acquaintance: Language-Independent Document Categorization by 
                   N-Grams},
   booktitle    = {Proceedings of TREC-4, 4th Text Retrieval Conference},
   publisher    = {National Institute of Standards and Technology, Gaithersburg, {US}},
   editor       = {Donna K. Harman and Ellen M. Voorhees},
   year         = {1995},
   address      = {Gaithersburg, {US}},
   pages        = {359--371},
   url          = {http://trec.nist.gov/pubs/trec4/papers/nsa.ps.gz},
   abstract     = {Acquaintance is the name of a novel vector-space n-gram technique for 
                   categorizing documents. The technique is completely 
                   language-independent, highly garble-resistant, and 
                   computationally simple. An unoptimized version of the algorithm 
                   was used to process the TREC database in a very short time. The 
                   TREC-3 conference provided the first public demonstration and 
                   evaluation of this new technique, and TREC-4 provided an 
                   opportunity to test its usefulness on several types of text 
                   retrieval tasks.},
}
@inProceedings{Hull94,
   author       = {Hull, David A.},
   title        = {Improving text retrieval for the routing problem using latent 
                   semantic indexing},
   booktitle    = {Proceedings of SIGIR-94, 17th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {W. Bruce Croft and Cornelis J. Van Rijsbergen},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   year         = {1994},
   address      = {Dublin, {IE}},
   pages        = {282--289},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/188490/p282-hull/p282-hull.pdf},
   abstract     = {Latent Semantic Indexing (LSI) is a novel approach to information 
                   retrieval that attempts to model the underlying structure of term 
                   associations by transforming the traditional representation of 
                   documents as vectors of weighted term frequencies to a new 
                   coordinate space where both documents and terms are represented 
                   as linear combinations of underlying semantic factors. In 
                   previous research, LSI has produced a small improvement in 
                   retrieval performance. In this paper, we apply LSI to the routing 
                   task, which operates under the assumption that a sample of 
                   relevant and non-relevant documents is available to use in 
                   constructing the query. Once again, LSI slightly improves 
                    performance. However, when LSI is used in conjunction with 
                   statistical classification, there is a dramatic improvement in 
                   performance.},
}
@inProceedings{Hull96,
   author       = {David A. Hull and Jan O. Pedersen and Hinrich Sch{\"u}tze},
   title        = {Method combination for document filtering},
   booktitle    = {Proceedings of SIGIR-96, 19th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Hans-Peter Frei and Donna Harman and Peter Sch{\"{a}}uble and 
                   Ross Wilkinson},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1996},
   address      = {Z{\"{u}}rich, {CH}},
   pages        = {279--288},
   url          = {ftp://parcftp.xerox.com/pub/qca/papers/sigirfiltering96.ps},
   abstract     = {There is strong empirical and theoretic evidence that combination 
                   of retrieval methods can improve performance. In this paper, we 
                   systematically compare combination strategies in the context of 
                   document filtering, using queries from the Tipster reference 
                   corpus. We find that simple averaging strategies do indeed 
                   improve performance, but that direct averaging of probability 
                   estimates is not the correct approach. Instead, the probability 
                   estimates must be renormalized using logistic regression on the 
                   known relevance judgements. We examine more complex combination 
                   strategies but find them less successful due to the high 
                   correlations among our filtering methods which are optimized over 
                   the same training data and employ similar document 
                   representations.},
}
@inProceedings{Hull98,
   author       = {David A. Hull},
   title        = {The {TREC-7} filtering track: description and analysis},
   booktitle    = {Proceedings of TREC-7, 7th Text Retrieval Conference},
   publisher    = {National Institute of Standards and Technology, Gaithersburg, {US}},
   editor       = {Ellen M. Voorhees and Donna K. Harman},
   year         = {1998},
   address      = {Gaithersburg, {US}},
   pages        = {33--56},
   url          = {http://trec.nist.gov/pubs/trec7/papers/tr7filter/paper.ps},
   abstract     = {This article describes the experiments conducted in the TREC-7 
                   filtering track, which consisted of three subtasks: adaptive 
                    filtering, batch filtering, and routing. The focus this year is 
                   on adaptive filtering, where the system begins with only the 
                   topic statement and must interactively adjust a filtering profile 
                   constructed from that topic in response to on-line feedback. In 
                   addition to motivating the task and describing the practical 
                   details of participating in the track, this document includes a 
                   detailed graphical presentation of the experimental results and 
                   provides a brief overall analysis of the performance data.},
}
@inProceedings{Ipeirotis01,
   author       = {Panagiotis G. Ipeirotis and Luis Gravano and Mehran Sahami},
   title        = {Probe, count, and classify: categorizing hidden {W}eb databases},
   booktitle    = {Proceedings of SIGMOD-01, ACM International Conference on 
                   Management of Data},
   editor       = {Walid G. Aref},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {2001},
   address      = {Santa Barbara, {US}},
   pages        = {67--78},
   url          = {http://doi.acm.org/10.1145/375663.375671},
   abstract     = {The contents of many valuable web-accessible databases are only 
                    accessible through search interfaces and are hence invisible to 
                    traditional web ``crawlers''. Recent studies have estimated the 
                    size of this ``hidden web'' to be 500 billion pages, while the 
                    size of the ``crawlable'' web is only an estimated two billion 
                    pages. Recently, commercial web sites have started to manually 
                    organize web-accessible databases into Yahoo!-like hierarchical 
                    classification schemes. In this paper, we introduce a method for 
                    automating this classification process by using a small number 
                    of query probes. To classify a database, our algorithm does not 
                    retrieve or inspect any documents or pages from the database, 
                    but rather just exploits the number of matches that each query 
                    probe generates at the database in question. We have conducted an 
                    extensive experimental evaluation of our technique over 
                    collections of real documents, including over one hundred 
                    web-accessible databases. Our experiments show that our system 
                    has low overhead and achieves high classification accuracy 
                    across a variety of databases.},
}
@inProceedings{Ittner95,
   author       = {David J. Ittner and David D. Lewis and David D. Ahn},
   title        = {Text categorization of low quality images},
   booktitle    = {Proceedings of SDAIR-95, 4th Annual Symposium on Document 
                   Analysis and Information Retrieval},
   publisher    = {},
   editor       = {},
   year         = {1995},
   address      = {Las Vegas, {US}},
   pages        = {301--315},
   url          = {http://www.research.att.com/~lewis/papers/ittner95.ps},
   abstract     = {Categorization of text images into content-oriented classes would 
                   be a useful capability in a variety of document handling systems. 
                   Many methods can be used to categorize texts once their words are 
                   known, but OCR can garble a large proportion of words, 
                   particularly when low quality images are used. Despite this, we 
                   show for one data set that fax quality images can be categorized 
                   with nearly the same accuracy as the original text. Further, the 
                   categorization system can be trained on noisy OCR output, without 
                   need for the true text of any image, or for editing of OCR 
                   output. The use of a vector space classifier and training method 
                   robust to large feature sets, combined with discarding of low 
                   frequency OCR output strings are the key to our approach.},
}
@inProceedings{Iwayama94,
   author       = {Makoto Iwayama and Takenobu Tokunaga},
   title        = {A Probabilistic Model for Text Categorization: Based on a Single 
                   Random Variable with Multiple Values},
   booktitle    = {Proceedings of ANLP-94, 4th Conference on Applied Natural 
                   Language Processing},
   publisher    = {Association for Computational Linguistics, Morristown, {US}},
   editor       = {},
   year         = {1994},
   address      = {Stuttgart, {DE}},
   pages        = {162--167},
   url          = {},
   abstract     = {},
}
@inProceedings{Iwayama95,
   author       = {Makoto Iwayama and Takenobu Tokunaga},
   title        = {Cluster-based text categorization: a comparison of category 
                   search strategies},
   booktitle    = {Proceedings of SIGIR-95, 18th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Edward A. Fox and Peter Ingwersen and Raya Fidel},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1995},
   address      = {Seattle, {US}},
   pages        = {273--281},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/215206/p273-iwayama/p273-iwayama.pdf},
   abstract     = {Text categorization can be viewed as a process of category 
                   search, in which one or more categories for a test document are 
                   searched for by using given training documents with known 
                   categories. A cluster based search with a probabilistic 
                   clustering algorithm is proposed and evaluated on two data sets. 
                   The efficiency, effectiveness, and noise tolerance of this search 
                   strategy were confirmed to be better than those of a full search, 
                   a category based search, and a cluster based search with 
                   nonprobabilistic clustering.},
}
@inProceedings{Iwayama95a,
   author       = {Makoto Iwayama and Takenobu Tokunaga},
   title        = {Hierarchical {B}ayesian clustering for automatic text 
                   classification},
   booktitle    = {Proceedings of IJCAI-95, 14th International Joint Conference on 
                   Artificial Intelligence},
   editor       = {Chris E. Mellish},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   year         = {1995},
   address      = {Montreal, {CA}},
   pages        = {1322--1327},
   url          = {},
   abstract     = {Text classification, the grouping of texts into several clusters, 
                   has been used as a means of improving both the efficiency and the 
                   effectiveness of text retrieval/categorization. In this paper we 
                   propose a hierarchical clustering algorithm that constructs a set 
                   of clusters having the maximum Bayesian posterior probability, 
                   the probability that the given texts are classified into 
                   clusters. We call the algorithm Hierarchical Bayesian Clustering 
                   (HBC). The advantages of HBC are experimentally verified from 
                   several viewpoints. HBC can reconstruct the original clusters 
                   more accurately than other non-probabilistic algorithms. When a 
                   probabilistic text categorization is extended to a cluster-based 
                   one, the use of HBC offers better performance than the use of 
                   non-probabilistic algorithms.},
}
@inProceedings{Iwazume96,
   author       = {Michiaki Iwazume and Hideaki Takeda and Toyoaki Nishida},
   title        = {Ontology-Based Information Gathering and Text Categorization from 
                   the {I}nternet},
   booktitle    = {Proceedings of IEA/AIE-96, 9th International Conference in 
                   Industrial and Engineering Applications of Artificial 
                   Intelligence and Expert Systems},
   editor       = {},
   publisher    = {},
   year         = {1996},
   address      = {Fukuoka, {JP}},
   pages        = {305--314},
   url          = {},
   abstract     = {},
}
@inProceedings{Iyer00,
   author       = {Raj D. Iyer and David D. Lewis and Robert E. Schapire and Yoram 
                   Singer and Amit Singhal},
   title        = {Boosting for Document Routing},
   booktitle    = {Proceedings of CIKM-00, 9th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {McLean, {US}},
   editor       = {Arvin Agah and Jamie Callan and Elke Rundensteiner},
   year         = {2000},
   pages        = {70--77},
   url          = {http://www.cs.huji.ac.il/~singer/papers/rankboost.ps.gz},
   abstract     = {RankBoost is a recently proposed algorithm for learning ranking 
                   functions. It is simple to implement and has strong 
                   justifications from computational learning theory. We describe 
                   the algorithm and present experimental results on applying it to 
                   the document routing problem. The first set of results applies 
                   RankBoost to a text representation produced using modern term 
                    weighting methods. Performance of RankBoost is somewhat inferior 
                    to that of a state-of-the-art routing algorithm which is, 
                    however, more complex and less theoretically justified than 
                   RankBoost. RankBoost achieves comparable performance to the 
                   state-of-the-art algorithm when combined with feature or example 
                   selection heuristics. Our second set of results examines the 
                   behavior of RankBoost when it has to learn not only a ranking 
                   function but also all aspects of term weighting from raw data. 
                   Performance is usually, though not always, less good here, but 
                   the term weighting functions implicit in the resulting ranking 
                   functions are intriguing, and the approach could easily be 
                   adapted to mixtures of textual and nontextual data.},
}
@inProceedings{Jacobs92,
   author       = {Paul S. Jacobs},
   title        = {Joining statistics with {NLP} for text categorization},
   booktitle    = {Proceedings of ANLP-92, 3rd Conference on Applied Natural 
                   Language Processing},
   publisher    = {Association for Computational Linguistics, Morristown, {US}},
   editor       = {Marcia Bates and Oliviero Stock},
   year         = {1992},
   address      = {Trento, {IT}},
   pages        = {178--185},
   url          = {},
   abstract     = {Automatic news categorization systems have produced high 
                   accuracy, consistency, and flexibility using some natural 
                   language processing techniques. These knowledge-based 
                   categorization methods are more powerful and accurate than 
                   statistical techniques. However, the phrasal pre-processing and 
                   pattern matching methods that seem to work for categorization 
                   have the disadvantage of requiring a fair amount of 
                   knowledge-encoding by human beings. In addition, they work much 
                   better at certain tasks, such as identifying major events in 
                   texts, than at others, such as determining what sort of business 
                   or product is involved in a news event. Statistical methods for 
                   categorization, on the other hand, are easy to implement and 
                   require little or no human customization. But they don't offer 
                   any of the benefits of natural language processing, such as the 
                   ability to identify relationships and enforce linguistic 
                   constraints. The authors' approach has been to use statistics in 
                   the knowledge acquisition component of a linguistic pattern-based 
                   categorization system, using statistical methods, for example, to 
                    associate words with industries and identify phrases that carry 
                    information about businesses or products. Instead of replacing 
                   knowledge-based methods with statistics, statistical training 
                   replaces knowledge engineering. This has resulted in high 
                   accuracy, shorter customization time, and good prospects for the 
                   application of the statistical methods to problems in lexical 
                   acquisition.},
}
@article{Jacobs93,
   author       = {Paul S. Jacobs},
   title        = {Using Statistical Methods to Improve Knowledge-Based News 
                   Categorization},
   journal      = {{IEEE} Expert},
   year         = {1993},
   number       = {2},
   volume       = {8},
   pages        = {13--23},
   url          = {},
   abstract     = {},
}
@inProceedings{Jo99,
   author       = {Taeho C. Jo},
   title        = {Text categorization with the concept of fuzzy set of informative 
                   keywords},
   booktitle    = {Proceedings of FUZZ-IEEE'99, IEEE International Conference on 
                   Fuzzy Systems},
   editor       = {},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   address      = {Seoul, {KR}},
   pages        = {609--614},
   year         = {1999},
   url          = {},
   abstract     = {Text categorization is the procedure of assigning a category to a 
                   particular document among predefined categories. Informative 
                   keywords are the ones which reflect the contents of a document. A 
                   document includes informative keywords and non-informative 
                    keywords. Non-informative keywords mainly play the role of 
                    grammatical functions in sentences; such keywords, which are 
                    called functional keywords, reflect the document's contents 
                    very little, so they should be removed in the process of 
                    document indexing. The discrimination between informative 
                    keywords and functional keywords is not crisp. In the process 
                    of document indexing, a document is represented as a set of 
                    informative keywords. In this paper, it is proposed that a 
                    document be represented as a fuzzy set of informative 
                    keywords, instead of a crisp set of informative keywords. 
                    Experiments on the categorization of news articles show that 
                    the proposed schemes of text categorization outperform the 
                    schemes with crisp sets.},
}
@inCollection{Jo99a,
   author       = {Taeho C. Jo},
   title        = {News article classification based on categorical points from 
                   keywords in backdata},
   booktitle    = {Computational Intelligence for Modelling, Control and Automation},
   editor       = {Masoud Mohammadian},
   publisher    = {{IOS} Press},
   address      = {Amsterdam, {NL}},
   pages        = {211--214},
   year         = {1999},
   url          = {},
   abstract     = {A scheme of automatic document classification is presented. 
                   Previously, documents have been classified according to their 
                   contents manually. Therefore, it is very costly to assign a 
                   category to them because a human investigates their contents. As 
                    the amount of data stored in storage media increases 
                    exponentially, it becomes necessary to store documents according 
                    to their category, to access them easily. Automatic text 
                    classification is needed to store documents in this way. Before 
                   performing text classification, back data should be constructed. 
                   The back data stores the information about keywords: the 
                   frequency for each category, the number of documents for each 
                   category. A document is represented with a list of keywords. 
                    Categorical points for each category are computed by summing the 
                    frequency of each keyword from back data, or the number of 
                    documents from it. The category with the largest 
                    categorical points is selected as the category of a document. In 
                   the results of an experiment with news article classification, 
                   precision is about 98\%.},
}
@inCollection{Jo99b,
   author       = {Taeho C. Jo},
   title        = {News articles classification based on representative keywords of 
                   categories},
   booktitle    = {Computational Intelligence for Modelling, Control and Automation},
   editor       = {Masoud Mohammadian},
   publisher    = {{IOS} Press},
   address      = {Amsterdam, {NL}},
   pages        = {194--198},
   year         = {1999},
   url          = {},
   abstract     = {A scheme of automatic document classification is presented. So 
                   far, documents have been classified according to their contents 
                    manually. Therefore, it is very costly to assign a category to 
                    them because humans investigate their contents. As the amount of 
                    data stored in storage media increases exponentially, it 
                    becomes necessary to store documents according to their category, 
                    to access them easily. Automatic text classification is necessary 
                    to store documents in this way. The scheme for automatic text 
                    classification proposed in the paper is based on document 
                   indexing, where a document is represented as a list of keywords. 
                   The number of common keywords between keywords from the document 
                   itself and representative keywords from back data classifies 
                   documents. As an example, the proposed scheme is applied to the 
                   classification of news articles into 3 categories: politics, 
                   sports, and business. The measurements of performance evaluation 
                   are: classification rate, correctness rate, and classified 
                   correctness rate.},
}
@inProceedings{Joachims97,
   author       = {Thorsten Joachims},
   title        = {A probabilistic analysis of the {R}occhio algorithm with {TFIDF} 
                   for text categorization},
   booktitle    = {Proceedings of ICML-97, 14th International Conference on Machine 
                   Learning},
   editor       = {Douglas H. Fisher},
   year         = {1997},
   address      = {Nashville, {US}},
   pages        = {143--151},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www-ai.cs.uni-dortmund.de/DOKUMENTE/joachims_97a.ps.gz},
   abstract     = {The Rocchio relevance feedback algorithm is one of the most 
                   popular and widely applied learning methods from information 
                   retrieval. Here, a probabilistic analysis of this algorithm is 
                   presented in a text categorization framework. The analysis gives 
                    theoretical insight into the heuristics used in the Rocchio 
                   algorithm, particularly the word weighting scheme and the 
                   similarity metric. It also suggests improvements which lead to a 
                   probabilistic variant of the Rocchio classifier. The Rocchio 
                   classifier, its probabilistic variant, and a naive Bayes 
                   classifier are compared on six text categorization tasks. The 
                   results show that the probabilistic algorithms are preferable to 
                   the heuristic Rocchio classifier not only because they are more 
                   well-founded, but also because they achieve better performance.},
}
@inProceedings{Joachims97b,
   author       = {Thorsten Joachims and Dayne Freitag and Tom M. Mitchell},
   title        = {{\sc WebWatcher}: a tour guide for the {W}orld {W}ide {W}eb},
   booktitle    = {Proceedings of IJCAI-97, 15th International Joint Conference on 
                   Artificial Intelligence},
   editor       = {Martha E. Pollack},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   year         = {1997},
   address      = {Nagoya, {JP}},
   pages        = {770--775},
   url          = {http://www.cs.cmu.edu/afs/cs/user/dayne/www/ps/ijcai97.ps.Z},
   abstract     = {We describe WebWatcher as a tour guide agent for the web, the 
                   learning algorithms used by WebWatcher, experimental results 
                   based on learning from thousands of users, and lessons learned 
                   from this case study of tour guide agents.},
}
@inProceedings{Joachims98,
   author       = {Thorsten Joachims},
   title        = {Text categorization with support vector machines: learning with 
                   many relevant features},
   booktitle    = {Proceedings of ECML-98, 10th European Conference on Machine 
                   Learning},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1398},
   editor       = {Claire N{\'{e}}dellec and C{\'{e}}line Rouveirol},
   address      = {Chemnitz, {DE}},
   pages        = {137--142},
   year         = {1998},
   url          = {http://www-ai.cs.uni-dortmund.de/DOKUMENTE/joachims_98a.ps.gz},
   abstract     = {The paper explores the use of Support Vector Machines (SVMs) for 
                   learning text classifiers from examples. It analyzes the 
                   particular properties of learning with text data and identifies 
                   why SVMs are appropriate for this task. Empirical results support 
                   the theoretical findings. SVMs achieve substantial improvements 
                   over the currently best performing methods and behave robustly 
                   over a variety of different learning tasks. Furthermore, they are 
                   fully automatic, eliminating the need for manual parameter 
                   tuning.},
}
@inProceedings{Joachims99,
   author       = {Thorsten Joachims},
   title        = {Transductive Inference for Text Classification using Support 
                   Vector Machines},
   booktitle    = {Proceedings of ICML-99, 16th International Conference on Machine 
                   Learning},
   editor       = {Ivan Bratko and Saso Dzeroski},
   year         = {1999},
   address      = {Bled, {SL}},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   pages        = {200--209},
   url          = {http://www-ai.cs.uni-dortmund.de/DOKUMENTE/joachims_99c.ps.gz},
   abstract     = {This paper introduces transductive support vector machines 
                   (TSVMs) for text classification. While regular support vector 
                   machines (SVMs) try to induce a general decision function for a 
                   learning task, TSVMs take into account a particular test set and 
                   try to minimize misclassifications of just those particular 
                   examples. The paper presents an analysis of why TSVMs are well 
                   suited for text classification. These theoretical findings are 
                   supported by experiments on three test collections. The 
                   experiments show substantial improvements over inductive methods, 
                   especially for small training sets, cutting the number of labeled 
                   training examples down to a 20th on some tasks. This work also 
                   proposes an algorithm for training TSVMs efficiently, handling 
                   10,000 examples and more.},
}
@inProceedings{Joachims00,
   author       = {Thorsten Joachims},
   title        = {Estimating the Generalization Performance of a {SVM} Efficiently},
   booktitle    = {Proceedings of ICML-00, 17th International Conference on Machine 
                   Learning},
   editor       = {Pat Langley},
   year         = {2000},
   address      = {Stanford, {US}},
   pages        = {431--438},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www-ai.cs.uni-dortmund.de/DOKUMENTE/joachims_00a.pdf},
   abstract     = {This paper proposes and analyzes an efficient and effective 
                   approach for estimating the generalization performance of a 
                   support vector machine (SVM) for text classification. Without any 
                   computation-intensive resampling, the new estimators are 
                   computationally much more efficient than cross-validation or 
                   bootstrapping. They can be computed at essentially no extra cost 
                   immediately after training a single SVM. Moreover, the estimators 
                   developed here address the special performance measures needed 
                   for evaluating text classifiers. They can be used not only to 
                   estimate the error rate, but also to estimate recall, precision, 
                   and F1. A theoretical analysis and experiments show that the new 
                   method can effectively estimate the performance of SVM text 
                   classifiers in an efficient way.},
}
@inProceedings{Joachims01c,
   author       = {Thorsten Joachims},
   title        = {A Statistical Learning Model of Text Classification with Support 
                   Vector Machines},
   booktitle    = {Proceedings of SIGIR-01, 24th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {W. Bruce Croft and David J. Harper and Donald H. Kraft and Justin 
                   Zobel},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {New Orleans, {US}},
   year         = {2001},
   pages        = {128--136},
   url          = {http://www.cs.cornell.edu/People/tj/publications/joachims_01a.pdf},
   abstract     = {This paper develops a theoretical learning model of text 
                   classification for Support Vector Machines (SVMs). It connects 
                   the statistical properties of text-classification tasks with the 
                   generalization performance of a SVM in a quantitative way. Unlike 
                   conventional approaches to learning text classifiers, which rely 
                   primarily on empirical evidence, this model explains why and when 
                   SVMs perform well for text classification. In particular, it 
                   addresses the following questions: Why can support vector 
                   machines handle the large feature spaces in text classification 
                   effectively? How is this related to the statistical properties of 
                   text? What are sufficient conditions for applying SVMs to 
                   text-classification problems successfully?},
}
@inProceedings{Joachims01b,
   author       = {Thorsten Joachims and Nello Cristianini and John Shawe-Taylor},
   title        = {Composite Kernels for Hypertext Categorisation},
   booktitle    = {Proceedings of ICML-01, 18th International Conference on Machine 
                   Learning},
   editor       = {Carla Brodley and Andrea Danyluk},
   address      = {Williams College, {US}},
   year         = {2001},
   pages        = {250--257},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www.cs.cornell.edu/People/tj/publications/joachims_etal_01a.pdf},
   abstract     = {Kernels are problem-specific functions that act as an interface 
                   between the learning system and the data. While it is well-known 
                   when the combination of two kernels is again a valid kernel, it 
                   is an open question if the resulting kernel will perform well. In 
                    particular, in which situations can a combination of kernels be 
                   expected to perform better than its components considered 
                   separately? Intuitively, one would like each of the two kernels 
                   to contribute information that is not available to the other. 
                    This characterization must hence consider the data at hand, both 
                    the kernels and also the task, that is, the information given by 
                    the labels. We investigate this problem by looking at the task of 
                    designing kernels for hypertext classification, where both word 
                    and link information can be exploited. Firstly, we introduce a 
                   novel kernel, whose Gram matrix is the well known co-citation 
                   matrix from bibliometrics, and demonstrate on real data that it 
                   has a good performance. Then we study the problem of combining it 
                   with a standard bag of words kernel. We provide sufficient 
                   conditions that indicate when an improvement can be expected, 
                   highlighting and formalising the notion of ``independent 
                   kernels''. Experimental results confirm the predictions of the 
                   theory in the hypertext domain.},
}
@book{Joachims02a,
   author       = {Thorsten Joachims},
   title        = {Learning to Classify Text using Support Vector Machines},
   publisher    = {Kluwer Academic Publishers},
   address      = {Dordrecht, {NL}},
   year         = {2002},
}
@article{Joachims02,
   author       = {Thorsten Joachims and Fabrizio Sebastiani},
   title        = {Guest editors' introduction to the special issue on automated 
                   text categorization},
   journal      = {Journal of Intelligent Information Systems},
   year         = {2002},
   note         = {Special Issue on Automated Text Categorization},
   volume       = {18},
   number       = {2/3},
   pages        = {103--105},
   url          = {http://www.wkap.nl/article.pdf?391241},
}
@article{Juan02,
   author       = {Juan, Alfons and Vidal, Enrique},
   title        = {On the use of {B}ernoulli mixture models for text classification},
   journal      = {Pattern Recognition},
   year         = {2002},
   volume       = {35},
   number       = {12},
   pages        = {2705--2710},
   url          = {},
   abstract     = {Mixture modelling of class-conditional densities is a standard 
                   pattern recognition technique. Although most research on mixture 
                   models has concentrated on mixtures for continuous data, emerging 
                   pattern recognition applications demand extending research 
                   efforts to other data types. This paper focuses on the 
                   application of mixtures of multivariate Bernoulli distributions 
                   to binary data. More concretely, a text classification task aimed 
                   at improving language modelling for machine translation is 
                   considered.},
}
@inProceedings{Junker97,
   author       = {Markus Junker and Andreas Abecker},
   title        = {Exploiting Thesaurus Knowledge in Rule Induction for Text 
                   Classification},
   booktitle    = {Proceedings of RANLP-97, 2nd International Conference on Recent 
                   Advances in Natural Language Processing},
   publisher    = {},
   editor       = {Ruslan Mitkov and Nicolas Nicolov and Nikolai Nikolov},
   address      = {Tzigov Chark, {BG}},
   pages        = {202--207},
   year         = {1997},
   url          = {http://www.dfki.uni-kl.de/~junker/download/ranlp97.ps},
   abstract     = {Systems for learning text classifiers have recently gained 
                   considerable interest. One technique to implement such systems is 
                   rule induction. While most other approaches rely on a relatively 
                   simple document representation and do not make use of any 
                   background knowledge, rule induction algorithms offer a good 
                   potential for improvements in both of these areas. In this paper, 
                   we show how an operator-based view of rule induction enables the 
                   easy integration of a thesaurus as background knowledge. Results 
                   with an algorithm extended by thesaurus knowledge are presented 
                   and interpreted. The interpretation shows the strengths and 
                   weaknesses of using thesaurus knowledge and gives hints for 
                   future research.},
}
@article{Junker98,
   author       = {Markus Junker and Rainer Hoch},
   title        = {An experimental evaluation of {OCR} text representations for 
                   learning document classifiers},
   journal      = {International Journal on Document Analysis and Recognition},
   pages        = {116--122},
   year         = {1998},
   number       = {2},
   volume       = {1},
   url          = {http://link.springer.de/link/service/journals/10032/papers/8001002/80010116.ps.gz},
   abstract     = {In the literature, many feature types are proposed for document 
                   classification. However, an extensive and systematic evaluation 
                   of the various approaches has not yet been done. In particular, 
                   evaluations on OCR documents are very rare. In this paper we 
                   investigate seven text representations based on n-grams and 
                   single words. We compare their effectiveness in classifying OCR 
                   texts and the corresponding correct ASCII texts in two domains: 
                   business letters and abstracts of technical reports. Our results 
                   indicate that the use of n-grams is an attractive technique which 
                   can even compare to techniques relying on a morphological 
                   analysis. This holds for OCR texts as well as for correct ASCII 
                   texts.},
}
@inProceedings{Junker00,
   author       = {Markus Junker and Michael Sintek and Matthias Rinck},
   title        = {Learning for text categorization and information extraction with 
                   ILP},
   booktitle    = {Proceedings of the 1st Workshop on Learning Language in Logic},
   editor       = {James Cussens and Saso Dzeroski},
   year         = {2000},
   address      = {Bled, {SL}},
   pages        = {247--258},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1925},
   url          = {},
   abstract     = {Text categorization (TC) and information extraction (IE) are two 
                   important goals of natural language processing. While 
                   hand-crafting rules for both tasks has a long tradition, learning 
                    approaches have gained much interest in the past. Since in both 
                    tasks text as a sequence of words is of crucial importance, 
                    propositional learners have strong limitations. Although viewing 
                   learning for TC and IE as inductive logic programming (ILP) 
                   problems is obvious, most approaches rather use proprietary 
                   formalisms. In this paper, we provide a solid basis for the 
                   application of ILP methods to these learning problems. We 
                   introduce three basic types (namely a type for text, one for 
                   words and one for positions in texts) and three simple predicate 
                   definitions over these types which enable us to write TC and IE 
                   rules as logic programs. Based on the proposed representation, we 
                   present an approach to the problem of learning rules for TC and 
                   IE in terms of ILP. We conclude by comparing our approach of 
                   representing texts and rules as logic programs to others.},
}
@inProceedings{Junker01,
   author       = {Markus Junker and Andreas Dengel},
   title        = {Preventing Overfitting in Learning Text Patterns for Document 
                   Categorization},
   booktitle    = {Proceedings of ICAPR-01, 2nd International Conference on Advances 
                   in Pattern Recognition},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2013},
   editor       = {Sameer Singh and Nabeel A. Murshed and Walter Kropatsch},
   address      = {Rio De Janeiro, {BR}},
   year         = {2001},
   pages        = {137--146},
   url          = {http://link.springer.de/link/service/series/0558/papers/2013/20130137.pdf},
   abstract     = {There is an increasing interest in categorizing texts using 
                   learning algorithms. While the majority of approaches rely on 
                   learning linear classifiers, there is also some interest in 
                   describing document categories by text patterns. We introduce a 
                   model for learning patterns for text categorization (the 
                   LPT-model) that does not rely on an attribute-value 
                   representation of documents but represents documents essentially 
                   "as they are". Based on the LPT-model, we focus on learning 
                   patterns within a relatively simple pattern language. We compare 
                   different search heuristics and pruning methods known from 
                   various symbolic rule learners on a set of representative text 
                   categorization problems. The best results were obtained using the 
                    m-estimate as the search heuristic combined with the 
                    likelihood-ratio statistic for pruning. Even better results can be 
                    obtained when replacing the likelihood-ratio statistic by a new 
                    measure for pruning, which we call the l-measure. In contrast to 
                   conventional measures for pruning, the l-measure takes into 
                   account properties of the search space.},
}
@article{Kaban02,
   author       = {Ata Kaban and Mark Girolami},
   title        = {A Dynamic Probabilistic Model to Visualise Topic Evolution in 
                   Text Streams},
   journal      = {Journal of Intelligent Information Systems},
   year         = {2002},
   note         = {Special Issue on Automated Text Categorization},
   volume       = {18},
   number       = {2/3},
   pages        = {107--125},
   url          = {http://www.wkap.nl/article.pdf?391242},
   abstract     = {We propose a novel probabilistic method, based on latent variable 
                   models, for unsupervised topographic visualisation of dynamically 
                   evolving, coherent textual information. This can be seen as a 
                   complementary tool for topic detection and tracking applications. 
                   This is achieved by the exploitation of the a priori domain 
                   knowledge available, that there are relatively homogeneous 
                   temporal segments in the data stream. In a different manner from 
                   topographical techniques previously utilized for static text 
                   collections, the topography is an outcome of the coherence in 
                   time of the data stream in the proposed model. Simulation results 
                   on both toy-data settings and an actual application on Internet 
                    chat line discussion analysis are presented by way of 
                   demonstration.},
}
@article{Kar78,
   author       = {Gautam Kar and Lee J. White},
   title        = {A distance measure for automated document classification by 
                   sequential analysis},
   journal      = {Information Processing and Management},
   pages        = {57--69},
   year         = {1978},
   number       = {2},
   volume       = {14},
   url          = {},
   abstract     = {},
}
@inProceedings{Karypis00,
   author       = {George Karypis and Eui-Hong Han},
   title        = {Fast Supervised Dimensionality Reduction Algorithm with 
                   Applications to Document Categorization and Retrieval},
   booktitle    = {Proceedings of CIKM-00, 9th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {McLean, {US}},
   editor       = {Arvin Agah and Jamie Callan and Elke Rundensteiner},
   year         = {2000},
   pages        = {12--19},
   url          = {ftp://ftp.cs.umn.edu/dept/users/kumar/cikm-ci.ps},
   abstract     = {Retrieval techniques based on dimensionality reduction, such as 
                   Latent Semantic Indexing (LSI), have been shown to improve the 
                   quality of the information being retrieved by capturing the 
                   latent meaning of the words present in the documents. 
                   Unfortunately, the high computational and memory requirements of 
                   LSI and its inability to compute an effective dimensionality 
                   reduction in a supervised setting limits its applicability. In 
                   this paper we present a fast supervised dimensionality reduction 
                   algorithm that is derived from the recently developed 
                   cluster-based unsupervised dimensionality reduction algorithms. 
                   We experimentally evaluate the quality of the lower dimensional 
                   spaces both in the context of document categorization and 
                   improvements in retrieval performance on a variety of different 
                   document collections. Our experiments show that the lower 
                   dimensional spaces computed by our algorithm consistently improve 
                   the performance of traditional algorithms such as C4.5, 
                   k-nearest-neighbor, and Support Vector Machines (SVM), by an 
                   average of 2\% to 7\%. Furthermore, the supervised lower 
                   dimensional space greatly improves the retrieval performance when 
                   compared to LSI.},
}
@inProceedings{Kawatani02,
   author       = {Takahiko Kawatani},
   title        = {Topic Difference Factor Extraction between Two Document Sets and 
                   its Application to Text Categorization},
   booktitle    = {Proceedings of SIGIR-02, 25th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng 
                   and Kalervo J{\"{a}}rvelin},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Tampere, {FI}},
   year         = {2002},
   pages        = {137--144},
   url          = {http://doi.acm.org/10.1145/564376.564402},
   abstract     = {To improve performance in text categorization, it is important to 
                   extract distinctive features for each class. This paper proposes 
                   topic difference factor analysis (TDFA) as a method to extract 
                   projection axes that reflect topic differences between two 
                   document sets. Suppose all sentence vectors that compose each 
                   document are projected onto projection axes. TDFA obtains the 
                   axes that maximize the ratio between the document sets as to the 
                   sum of squared projections by solving a generalized eigenvalue 
                   problem. The axes are called topic difference factors (TDF's). By 
                   applying TDFA to the document set that belongs to a given class 
                   and a set of documents that is misclassified as belonging to that 
                    class by an existing classifier, we can obtain features that take 
                   large values in the given class but small ones in other classes, 
                   as well as features that take large values in other classes but 
                    small ones in the given class. A classifier was constructed by 
                    applying the above features to complement the kNN classifier. As 
                    a result, the micro-averaged F1 measure for Reuters-21578 
                    improved from 83.69\% to 87.27\%.},
}
@inProceedings{Kessler97,
   author       = {Brett Kessler and Geoff Nunberg and Hinrich Sch{\"{u}}tze},
   title        = {Automatic detection of text genre},
   booktitle    = {Proceedings of ACL-97, 35th Annual Meeting of the Association for 
                   Computational Linguistics},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   editor       = {Philip R. Cohen and Wolfgang Wahlster},
   year         = {1997},
   address      = {Madrid, {ES}},
   pages        = {32--38},
   url          = {ftp://parcftp.xerox.com/pub/qca/genre/paper.acl97.ps.Z},
   abstract     = {As the text databases available to users become larger and more 
                   heterogeneous, genre becomes increasingly important for 
                   computational linguistics as a complement to topical and 
                   structural principles of classification. We propose a theory of 
                   genres as bundles of facets, which correlate with various surface 
                   cues, and argue that genre detection based on surface cues is as 
                   successful as detection based on deeper structural properties.},
}
@inProceedings{Khmelev03,
   author       = {Dmitry V. Khmelev and William J. Teahan},
   title        = {A repetition based measure for verification of text collections 
                   and for text categorization},
   booktitle    = {Proceedings of SIGIR-03, 26th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Jamie Callan and Gordon Cormack and Charles Clarke and David 
                   Hawking and Alan Smeaton},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Toronto, {CA}},
   year         = {2003},
   pages        = {104--110},
   url          = {http://doi.acm.org/10.1145/860435.860456},
   abstract     = {We suggest a way for locating duplicates and plagiarisms in a 
                   text collection using an R-measure, which is the normalized sum 
                   of the lengths of all suffixes of the text repeated in other 
                   documents of the collection. The R-measure can be effectively 
                   computed using the suffix array data structure. Additionally, the 
                   computation procedure can be improved to locate the sets of 
                   duplicate or plagiarised documents. We applied the technique to 
                   several standard text collections and found that they contained a 
                   significant number of duplicate and plagiarised documents. 
                   Another reformulation of the method leads to an algorithm that 
                   can be applied to supervised multi-class categorization. We 
                   illustrate the approach using the recently available Reuters 
                   Corpus Volume 1 (RCV1). The results show that the method 
                   outperforms SVM at multi-class categorization, and interestingly, 
                   that results correlate strongly with compression-based methods.},
}
@inProceedings{Kim00,
   author       = {Yu-Hwan Kim and Shang-Yoon Hahn and Byoung-Tak Zhang},
   title        = {Text filtering by boosting naive {B}ayes classifiers},
   booktitle    = {Proceedings of SIGIR-00, 23rd ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Athens, {GR}},
   year         = {2000},
   pages        = {168--175},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/345508/p168-kim/p168-kim.pdf},
   abstract     = {Several machine learning algorithms have recently been used for 
                   text categorization and filtering. In particular, boosting 
                   methods such as AdaBoost have shown good performance applied to 
                    real text data. However, most existing boosting algorithms are 
                   based on classifiers that use binary-valued features. Thus, they 
                   do not fully make use of the weight information provided by 
                   standard term weighting methods. In this paper, we present a 
                   boosting-based learning method for text filtering that uses naive 
                   Bayes classifiers as a weak learner. The use of naive Bayes 
                   allows the boosting algorithm to utilize term frequency 
                   information while maintaining probabilistically accurate 
                   confidence ratio. Applied to TREC-7 and TREC-8 filtering track 
                   documents, the proposed method obtained a significant improvement 
                    in the LF1, LF2, F1 and F3 measures compared to the best results 
                   submitted by other TREC entries.},
}
@inProceedings{Kindermann01,
   author       = {J{\"{o}}rg Kindermann and Gerhard Paa{{\ss}} and Edda Leopold},
   title        = {Error Correcting Codes with Optimized {K}ullback-{L}eibler 
                   Distances for Text Categorization},
   booktitle    = {Proceedings of ECML-01, 12th European Conference on Machine 
                   Learning},
   editor       = {Luc De Raedt and Arno Siebes},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Freiburg, {DE}},
   year         = {2001},
   pages        = {266--275},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2168},
   url          = {http://link.springer.de/link/service/series/0558/papers/2168/21680266.pdf},
   abstract     = {We extend a multi-class categorization scheme proposed by 
                   Dietterich and Bakiri 1995 for binary classifiers, using error 
                   correcting codes. The extension comprises the computation of the 
                   codes by a simulated annealing algorithm and optimization of 
                   Kullback-Leibler (KL) category distances within the code-words. 
                   For the first time, we apply the scheme to text categorization 
                   with support vector machines (SVMs) on several large text corpora 
                   with more than 100 categories. The results are compared to 1-of-N 
                   coding (i.e.\ one SVM for each text category). We also 
                   investigate codes with optimized KL distance between the text 
                   categories which are merged in the code-words. We find that error 
                   correcting codes perform better than 1-of-N coding with 
                   increasing code length. For very long codes, the performance is 
                   in some cases further improved by KL-distance optimization.},
}
@inProceedings{Klas00,
   author       = {Klas, Claus-Peter and Fuhr, Norbert},
   title        = {A new Effective Approach for Categorizing {W}eb Documents},
   booktitle    = {Proceedings of BCSIRSG-00, the 22nd Annual Colloquium of the 
                   British Computer Society Information Retrieval Specialist Group},
   editor       = {},
   address      = {Cambridge, {UK}},
   year         = {2000},
   pages        = {},
   publisher    = {},
   url          = {http://ls6-www.informatik.uni-dortmund.de/bib/fulltext/ir/Klas_Fuhr:00.ps.gz},
   abstract     = {Categorization of Web documents poses a new challenge for 
                   automatic classification methods. In this paper, we present the 
                   megadocument approach for categorization. For each category, all 
                   corresponding document texts from the training sample are 
                   concatenated to a megadocument, which is indexed using standard 
                   methods. In order to classify a new document, the most similar 
                   megadocument determines the category to be assigned. Our 
                   evaluations show that for Web collections, the megadocument 
                    method clearly outperforms other classification methods. In 
                   contrast, for the Reuters collection, we only achieve mediocre 
                   results. Thus, our method seems to be well suited for 
                   heterogeneous document collections.},
}
@article{Klingbiel73,
   author       = {Paul H. Klingbiel},
   title        = {Machine-aided indexing of technical literature},
   journal      = {Information Storage and Retrieval},
   year         = {1973},
   volume       = {9},
   number       = {2},
   pages        = {79--84},
   url          = {},
   abstract     = {To index successfully in the Defense Documentation Center's 
                    environment, an automated system must choose single words or 
                   phrases (dependent upon context) rapidly and economically. The 
                   automation of DDC's indexing has been machine-aided from its 
                   inception. A machine-aided indexing (MAI) system is described 
                   that indexes one million words of text per hour of CPU time. 
                   Grammatical errors do not exceed five per cent of the output, so 
                   human screening is satisfactorily low. The system could 
                   potentially scale up to an operational size of 10 million words 
                   of text per YEAR - the equivalent of a dozen bibles or a third of 
                   the Encyclopedia Britannica. In a batch mode, the programs to 
                   accomplish this indexing would require no more than fifteen 
                   minutes of CPU time per week.},
}
@article{Klingbiel73a,
   author       = {Paul H. Klingbiel},
   title        = {A technique for machine-aided indexing},
   journal      = {Information Storage and Retrieval},
   year         = {1973},
   volume       = {9},
   number       = {9},
   pages        = {477--494},
   url          = {},
   abstract     = {Subject indexing of text can, in principle, be accomplished in 
                   many ways. The technique for machine-aided indexing (MAI) 
                   developed at the Defense Documentation Center (DDC) is 
                   illustrated on a randomly chosen abstract. Additional text is 
                   provided in coded form so that the reader can more fully explore 
                   this technique and form his own opinion of the applicability and 
                   versatility of this particular procedure. The DDC method for 
                   subject indexing is very close to operational status for a data 
                   base which grows at the rate of two million words of text per 
                   YEAR.},
}
@inProceedings{Klinkenberg00,
   author       = {Ralf Klinkenberg and Thorsten Joachims},
   title        = {Detecting concept drift with support vector machines},
   booktitle    = {Proceedings of ICML-00, 17th International Conference on Machine 
                   Learning},
   editor       = {Pat Langley},
   year         = {2000},
   address      = {Stanford, {US}},
   pages        = {487--494},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www-ai.cs.uni-dortmund.de/DOKUMENTE/klinkenberg_joachims_2000a.pdf.gz},
   abstract     = {For many learning tasks where data is collected over an extended 
                   period of time, its underlying distribution is likely to change. 
                   A typical example is information filtering, i.e. the adaptive 
                   classification of documents with respect to a particular user 
                   interest. Both the interest of the user and the document content 
                   change over time. A filtering system should be able to adapt to 
                   such concept changes. This paper proposes a new method to 
                   recognize and handle concept changes with support vector 
                   machines. The method maintains a window on the training data. The 
                   key idea is to automatically adjust the window size so that the 
                   estimated generalization error is minimized. The new approach is 
                   both theoretically well-founded as well as effective and 
                   efficient in practice. Since it does not require complicated 
                   parameterization, it is simpler to use and more robust than 
                   comparable heuristics. Experiments with simulated concept drift 
                   scenarios based on real-world text data compare the new method 
                   with other window management approaches. We show that it can 
                   effectively select an appropriate window size in a robust way.},
}
@inProceedings{Knorz82,
   author       = {Knorz, Gerhard},
   title        = {A decision theory approach to optimal automated indexing},
   booktitle    = {Proceedings of SIGIR-82, 5th ACM International Conference on 
                   Research and Development in Information Retrieval},
   year         = {1982},
   editor       = {Gerard Salton and Hans-Jochen Schneider},
   pages        = {174--193},
   address      = {Berlin, {DE}},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 146},
   url          = {},
   abstract     = {},
}
@inProceedings{Ko00,
   author       = {Youngjoong Ko and Jungyun Seo},
   title        = {Automatic Text Categorization by Unsupervised Learning},
   booktitle    = {Proceedings of COLING-00, the 18th International Conference on 
                   Computational Linguistics},
   year         = {2000},
   editor       = {},
   pages        = {},
   address      = {Saarbr{\"{u}}cken, {DE}},
   url          = {http://nlp3.korea.ac.kr/proceeding/coling2000/COLING/ps/066.ps},
   abstract     = {The goal of text categorization is to classify documents into a 
                    certain number of pre-defined categories. Previous work in 
                    this area has used a large number of labeled training documents 
                   for supervised learning. One problem is that it is difficult to 
                   create the labeled training documents. While it is easy to 
                   collect the unlabeled documents, it is not so easy to manually 
                   categorize them for creating training documents. In this paper, 
                   we propose an unsupervised learning method to overcome these 
                   difficulties. The proposed method divides the documents into 
                   sentences, and categorizes each sentence using keyword lists of 
                    each category and a sentence similarity measure. Then, it uses 
                   the categorized sentences for training. The proposed method shows 
                   a similar degree of performance, compared with the traditional 
                   supervised learning methods. Therefore, this method can be used 
                   in areas where low-cost text categorization is needed. It also 
                   can be used for creating training documents.},
}
@inProceedings{Ko02,
   author       = {Youngjoong Ko and Jinwoo Park and Jungyun Seo},
   title        = {Automatic Text Categorization using the Importance of Sentences},
   booktitle    = {Proceedings of COLING-02, the 19th International Conference on 
                   Computational Linguistics},
   year         = {2002},
   editor       = {},
   pages        = {},
   address      = {Taipei, {TW}},
   url          = {http://acl.ldc.upenn.edu/coling2002/proceedings/data/area-28/co-201.pdf},
   abstract     = {This paper proposes a new approach for text categorization, based 
                   on a feature projection technique. In our approach, training data 
                   are represented as the projections of training documents on each 
                   feature. The voting for a classification is processed on the 
                   basis of individual feature projections. The final classification 
                   of test documents is determined by a majority voting from the 
                   individual classifications of each feature. Our empirical results 
                   show that the proposed approach, Text Categorization using 
                   Feature Projections (TCFP), outperforms k-NN, Rocchio, and Naïve 
                   Bayes. Most of all, TCFP is about one hundred times faster than 
                    k-NN. Since the TCFP algorithm is very simple, its implementation and 
                   training process can be done very easily. For these reasons, TCFP 
                    can be a useful classifier in areas that need fast and 
                    high-performance text categorization.},
}
@inProceedings{Ko02a,
   author       = {Youngjoong Ko and Jungyun Seo},
   title        = {Text Categorization using Feature Projections},
   booktitle    = {Proceedings of COLING-02, the 19th International Conference on 
                   Computational Linguistics},
   year         = {2002},
   editor       = {},
   pages        = {},
   address      = {Taipei, {TW}},
   url          = {http://acl.ldc.upenn.edu/coling2002/proceedings/data/area-28/co-269.pdf},
   abstract     = {Automatic text categorization is a problem of automatically 
                   assigning text documents to predefined categories. In order to 
                   classify text documents, we must extract good features from them. 
                   In previous research, a text document is commonly represented by 
                    the term frequency and the inverse document frequency of each 
                   feature. Since there is a difference between important sentences 
                   and unimportant sentences in a document, the features from more 
                   important sentences should be considered more than other 
                   features. In this paper, we measure the importance of sentences 
                   using text summarization techniques. Then a document is 
                   represented as a vector of features with different weights 
                   according to the importance of each sentence. To verify our new 
                   method, we conducted experiments on two language newsgroup data 
                    sets: one written in English and the other written in Korean. 
                   Four kinds of classifiers were used in our experiments: Naïve 
                   Bayes, Rocchio, k-NN, and SVM. We observed that our new method 
                   made a significant improvement in all classifiers and both data 
                   sets.},
}
@inProceedings{Koehn02,
   author       = {Philipp Koehn},
   title        = {Combining Multiclass Maximum Entropy Text Classifiers with Neural 
                   Network Voting},
   booktitle    = {Proceedings of PorTAL-02, 3rd International Conference on 
                   Advances in Natural Language Processing},
   year         = {2002},
   editor       = {Elisabete Ranchod and Nuno J. Mamede},
   pages        = {125--132},
   address      = {Faro, {PT}},
   url          = {http://link.springer.de/link/service/series/0558/papers/2389/23890125.pdf},
   abstract     = {We improve a high-accuracy maximum entropy classifier by 
                   combining an ensemble of classifiers with neural network voting. 
                   In our experiments we demonstrate significantly superior 
                   performance both over a single classifier as well as over the use 
                   of the traditional weighted-sum voting approach. Specifically, we 
                   apply this to a maximum entropy classifier on a large scale 
                   multi-class text categorization task: the online job directory 
                   Flipdog with over half a million jobs in 65 categories.},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2389},
}
@inProceedings{Kolcz01,
   author       = {Aleksander Kolcz and Vidya Prabakarmurthi and Jugal K. Kalita},
   title        = {String Match and Text Extraction: Summarization as feature 
                   selection for text categorization},
   booktitle    = {Proceedings of CIKM-01, 10th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {Henrique Paques and Ling Liu and David Grossman},
   year         = {2001},
   address      = {Atlanta, {US}},
   pages        = {365--370},
   url          = {http://doi.acm.org/10.1145/502585.502647},
   abstract     = {We address the problem of evaluating the effectiveness of 
                   summarization techniques for the task of document categorization. 
                   It is argued that for a large class of automatic categorization 
                   algorithms, extraction-based document categorization can be 
                   viewed as a particular form of feature selection performed on the 
                   full text of the document and, in this context, its impact can be 
                   compared with state-of-the-art feature selection techniques 
                   especially devised to provide good categorization performance. 
                   Such a framework provides for a better assessment of the expected 
                   performance of a categorizer if the compression rate of the 
                   summarizer is known.},
}
@inProceedings{Koller97,
   author       = {Daphne Koller and Mehran Sahami},
   title        = {Hierarchically classifying documents using very few words},
   booktitle    = {Proceedings of ICML-97, 14th International Conference on Machine 
                   Learning},
   editor       = {Douglas H. Fisher},
   year         = {1997},
   address      = {Nashville, {US}},
   pages        = {170--178},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://robotics.stanford.edu/users/sahami/papers-dir/ml97-hier.ps},
   abstract     = {The proliferation of topic hierarchies for text documents has 
                   resulted in a need for tools that automatically classify new 
                   documents within such hierarchies. Existing classification 
                   schemes which ignore the hierarchical structure and treat the 
                   topics as separate classes are often inadequate in text 
                    classification where there is a large number of classes and a 
                   huge number of relevant features needed to distinguish between 
                   them. We propose an approach that utilizes the hierarchical topic 
                   structure to decompose the classification task into a set of 
                   simpler problems, one at each node in the classification tree. As 
                   we show, each of these smaller problems can be solved accurately 
                   by focusing only on a very small set of features, those relevant 
                   to the task at hand. This set of relevant features varies widely 
                   throughout the hierarchy, so that, while the overall relevant 
                   feature set may be large, each classifier only examines a small 
                   subset. The use of reduced feature sets allows us to utilize more 
                   complex (probabilistic) models, without encountering many of the 
                   standard computational and robustness difficulties.},
}
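%
% Koller97 decomposes the task along the topic hierarchy: each internal
% node trains a small classifier over its own few relevant features, and a
% document is routed down the tree one decision at a time. A sketch in
% Python, assuming a hypothetical Node structure and a classifier object
% with a predict() method (both illustrative, not from the paper):
%
%   class Node:
%       def __init__(self, label, classifier=None, children=None,
%                    features=None):
%           self.label = label
%           self.classifier = classifier    # predicts a child label
%           self.children = children or {}  # child label -> Node
%           self.features = features or []  # node-specific feature subset
%
%   def classify_hierarchically(node, doc_vector):
%       # Walk from the root to a leaf, restricting the document to the
%       # small feature subset relevant at each node.
%       while node.children:
%           restricted = {f: doc_vector.get(f, 0.0) for f in node.features}
%           node = node.children[node.classifier.predict(restricted)]
%       return node.label
%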
@inProceedings{Kongovi02,
   author       = {Madhusudhan Kongovi and Juan Carlos Guzman and Venu Dasigi},
   title        = {Text Categorization: An experiment using Phrases},
   booktitle    = {Proceedings of ECIR-02, 24th European Colloquium on Information 
                   Retrieval Research},
   editor       = {Fabio Crestani and Mark Girolami and Cornelis J. Van Rijsbergen},
   year         = {2002},
   address      = {Glasgow, {UK}},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2291},
   pages        = {213--228},
   url          = {http://link.springer.de/link/service/series/0558/papers/2291/22910213.pdf},
   abstract     = {Typical text classifiers learn from example and training 
                   documents that have been manually categorized. In this research, 
                   our experiment dealt with the classification of news wire 
                   articles using category profiles. We built these profiles by 
                   selecting feature words and phrases from the training documents. 
                   For our experiments we decided on using the text corpus 
                   Reuters-21578. We used precision and recall to measure the 
                   effectiveness of our classifier. Though our experiments with 
                   words yielded good results, we found instances where the 
                   phrase-based approach produced more effectiveness. This could be 
                   due to the fact that when a word along with its adjoining word - 
                   a phrase - is considered towards building a category profile, it 
                   could be a good discriminator. This tight packaging of word pairs 
                   could bring in some semantic value. The packing of word pairs 
                   also filters out words occurring frequently in isolation that do 
                   not bear much weight towards characterizing that category.},
}
@article{Koppel02,
   author       = {Koppel, Moshe and Argamon, Shlomo and Shimoni, Anat R.},
   title        = {Automatically categorizing written texts by author gender},
   journal      = {Literary and Linguistic Computing},
   year         = {2002},
   number       = {4},
   volume       = {17},
   pages        = {401--412},
   url          = {http://www3.oup.co.uk/litlin/hdb/Volume_17/Issue_04/pdf/170401.pdf},
   abstract     = {The problem of automatically determining the gender of a 
                   document's author would appear to be a more subtle problem than 
                   those of categorization by topic or authorship attribution. 
                   Nevertheless, it is shown that automated text categorization 
                   techniques can exploit combinations of simple lexical and 
                   syntactic features to infer the gender of the author of an unseen 
                   formal written document with approximately 80 per cent accuracy. 
                   The same techniques can be used to determine if a document is 
                   fiction or non-fiction with approximately 98 per cent accuracy.},
}
@inProceedings{Kosmynin96,
   author       = {Arkadi Kosmynin and Ian Davidson},
   title        = {Using background contextual knowledge for documents 
                   representation},
   booktitle    = {Proceedings of PODP-96, 3rd International Workshop on Principles 
                   of Document Processing},
   editor       = {Charles K. Nicholas and Derick Wood},
   year         = {1996},
   address      = {Palo Alto, {CA}},
   pages        = {123--133},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1293},
   url          = {},
   abstract     = {We describe our approach to document representation that captures 
                   contextual dependencies between terms in a corpus and makes use 
                   of these dependencies to represent documents. We have tried our 
                   representation scheme for automatic document categorisation on 
                    the Reuters' test set of documents. We achieve a precision/recall 
                    break-even point of 84\%, which is comparable to the best known 
                   published results. Our approach acts as a feature selection 
                   technique that is an alternative to applying the techniques from 
                   machine learning and numerical taxonomy.},
}
@article{Krier02,
   author       = {Marc Krier and Francesco Zacc{\`a}},
   title        = {Automatic categorization applications at the {E}uropean {P}atent 
                   {O}ffice},
   journal      = {World Patent Information},
   year         = {2002},
   volume       = {24},
   number       = {},
   pages        = {187--196},
   url          = {},
   abstract     = {},
}
@inProceedings{Krishnapuram03,
   author       = {Raghu Krishnapuram and Krishna Chitrapura and Sachindra Joshi},
   title        = {Classification of Text Documents Based on Minimum System Entropy},
   booktitle    = {Proceedings of ICML-03, 20th International Conference on Machine 
                   Learning},
   editor       = {},
   year         = {2003},
   address      = {Washington, {DC}},
   pages        = {},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {},
   abstract     = {},
}
@inProceedings{Kwok98,
   author       = {James T. Kwok},
   title        = {Automated text categorization using support vector machine},
   booktitle    = {Proceedings of ICONIP'98, 5th International Conference on Neural 
                   Information Processing},
   editor       = {},
   year         = {1998},
   address      = {Kitakyushu, {JP}},
   pages        = {347--351},
   url          = {http://www.comp.hkbu.edu.hk/~jamesk/papers/iconip98.ps.gz},
   abstract     = {In this paper, we study the use of support vector machine in text 
                   categorization. Unlike other machine learning techniques, it 
                   allows easy incorporation of new documents into an existing 
                   trained system. Moreover, dimension reduction, which is usually 
                   imperative, now becomes optional. Thus, SVM adapts efficiently in 
                   dynamic environments that require frequent additions to the 
                   document collection. Empirical results on the Reuters-22173 
                   collection are also discussed.},
}
@inProceedings{Kwon99,
   author       = {Oh-Woog Kwon and Sung-Hwa Jung and Jong-Hyeok Lee and Geunbae Lee},
   title        = {Evaluation of Category Features and Text Structural Information 
                   on a Text Categorization Using Memory Based Reasoning},
   booktitle    = {Proceedings of ICCPOL-99, 18th International Conference on 
                   Computer Processing of Oriental Languages},
   editor       = {},
   year         = {1999},
   address      = {Tokushima, {JP}},
   pages        = {153--158},
   url          = {},
   abstract     = {},
}
@article{Kwon03,
   author       = {Oh-Woog Kwon and Jong-Hyeok Lee},
   title        = {Text categorization based on {k}-nearest neighbor approach for 
                   {W}eb site classification},
   journal      = {Information Processing and Management},
   year         = {2003},
   volume       = {39},
   number       = {1},
   pages        = {25--44},
   url          = {},
   abstract     = {},
}
@inProceedings{Labrou99,
   author       = {Yannis Labrou and Tim Finin},
   title        = {{{\sc Yahoo!}} as an ontology: using {{\sc Yahoo!}}\ categories 
                   to describe documents},
   booktitle    = {Proceedings of CIKM-99, 8th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {},
   year         = {1999},
   address      = {Kansas City, {US}},
   pages        = {180--187},
   url          = {http://www.acm.org/pubs/articles/proceedings/cikm/319950/p180-labrou/p180-labrou.pdf},
   abstract     = {We suggest that one (or a collection) of names of {{\sc Yahoo!}}\ 
                    (or any other WWW indexer's) categories can be used to describe 
                   the content of a document. Such categories offer a standardized 
                   and universal way for referring to or describing the nature of 
                   real world objects, activities, documents and so on, and may be 
                   used (we suggest) to semantically characterize the content of 
                   documents. WWW indices, like {{\sc Yahoo!}}\ provide a huge 
                   hierarchy of categories (topics) that touch every aspect of human 
                   endeavors. Such topics can be used as descriptors, similarly to 
                   the way librarians use for example, the Library of Congress 
                   cataloging system to annotate and categorize books. In the course 
                   of investigating this idea, we address the problem of automatic 
                   categorization of webpages in the {{\sc Yahoo!}}\ directory. We 
                   use Telltale as our classifier; Telltale uses n-grams to compute 
                   the similarity between documents. We experiment with various 
                   types of descriptions for the {{\sc Yahoo!}}\ categories and the 
                   webpages to be categorized. Our findings suggest that the best 
                   results occur when using the very brief descriptions of the {{\sc 
                   Yahoo!}}\ categorized entries; these brief descriptions are 
                    provided either by the entries' submitters or by the {{\sc 
                   Yahoo!}}\ human indexers and accompany most {{\sc 
                   Yahoo!}}\-indexed entries.},
}
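%
% The Labrou99 abstract credits Telltale with an n-gram similarity between
% documents and category descriptions. A rough Python sketch of a
% character n-gram profile compared by cosine similarity follows; n = 5
% and the whitespace normalization are illustrative assumptions, not
% Telltale's actual parameters.
%
%   from collections import Counter
%   from math import sqrt
%
%   def ngram_profile(text, n=5):
%       text = ' '.join(text.lower().split())
%       return Counter(text[i:i + n] for i in range(len(text) - n + 1))
%
%   def cosine(p, q):
%       num = sum(p[g] * q[g] for g in set(p) & set(q))
%       den = (sqrt(sum(v * v for v in p.values()))
%              * sqrt(sum(v * v for v in q.values())))
%       return num / den if den else 0.0
%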
@inProceedings{Lai01,
   author       = {Kwok-Yin Lai and Wai Lam},
   title        = {Meta-learning Models for Automatic Textual Document 
                   Categorization},
   booktitle    = {Proceedings of PAKDD-01, 5th Pacific-Asia Conference on 
                   Knowledge Discovery and Data Mining},
   editor       = {David Cheung and Qing Li and Graham Williams},
   year         = {2001},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Hong Kong, {CN}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2035},
   pages        = {78--89},
   url          = {http://link.springer.de/link/service/series/0558/papers/2035/20350078.pdf},
   abstract     = {We investigate two meta-model approaches for the task of 
                   automatic textual document categorization. The first approach is 
                   the linear combination approach. Based on the idea of distilling 
                   the characteristics of how we estimate the merits of each 
                   component algorithm, we propose three different strategies for 
                   the linear combination approach. The linear combination approach 
                   makes use of limited knowledge in the training document set. To 
                   address this limitation, we propose the second meta-model 
                   approach, called Meta-learning Using Document Feature 
                   characteristics (MUDOF), which employs a meta-learning phase 
                   using document feature characteristics. Document feature 
                   characteristics, derived from the training document set, capture 
                   some inherent properties of a particular category. Extensive 
                   experiments have been conducted on a real-world document 
                   collection and satisfactory performance is obtained.},
}
@article{Lai02,
   author       = {Yu-Sheng Lai and Chung-Hsien Wu},
   title        = {Meaningful term extraction and discriminative term 
                   selection in text categorization via unknown-word methodology},
   journal      = {{ACM} Transactions on Asian Language Information Processing},
   volume       = {1},
   number       = {1},
   pages        = {34--64},
   year         = {2002},
   url          = {http://doi.acm.org/10.1145/509900.509904},
   abstract     = {In this article, an approach based on unknown words is proposed 
                   for meaningful term extraction and discriminative term selection 
                   in text categorization. For meaningful term extraction, a 
                   phrase-like unit (PLU)-based likelihood ratio is proposed to 
                   estimate the likelihood that a word sequence is an unknown word. 
                   On the other hand, a discriminative measure is proposed for term 
                   selection and is combined with the PLU-based likelihood ratio to 
                   determine the text category. We conducted several experiments on 
                   a news corpus, called MSDN. The MSDN corpus is collected from an 
                   online news Website maintained by the Min-Sheng Daily News, 
                   Taiwan. The corpus contains 44,675 articles with over 35 million 
                   words. The experimental results show that the system using a 
                   simple classifier achieved 95.31\% accuracy. When using a 
                   state-of-the-art classifier, kNN, the average accuracy is 
                   96.40\%, outperforming all the other systems evaluated on the 
                   same collection, including the traditional term-word by kNN 
                   (88.52\%); sleeping-experts (82.22\%); sparse phrase by four-word 
                   sleeping-experts (86.34\%); and Boolean combinations of words by 
                   RIPPER (87.54\%). A proposed purification process can effectively 
                   reduce the dimensionality of the feature space from 50,576 terms 
                   in the word-based approach to 19,865 terms in the unknown 
                   word-based approach. In addition, more than 80\% of automatically 
                   extracted terms are meaningful. Experiments also show that the 
                   proportion of meaningful terms extracted from training data is 
                   relative to the classification accuracy in outside testing.},
}
@inProceedings{Lam99,
   author       = {Savio L. Lam and Dik L. Lee},
   title        = {Feature Reduction for Neural Network Based Text Categorization},
   booktitle    = {Proceedings of DASFAA-99, 6th IEEE International Conference on 
                   Database Advanced Systems for Advanced Application},
   editor       = {Arbee L. Chen and Frederick H. Lochovsky},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   year         = {1999},
   address      = {Hsinchu, {TW}},
   pages        = {195--202},
   url          = {http://dlib.computer.org/conferen/dasfaa/0084/pdf/00840195.pdf},
   abstract     = {In a text categorization model using an artificial neural network 
                    as the text classifier, scalability is poor if the neural network 
                    is trained using the raw feature space, since textual data has a 
                    very high-dimensional feature space. We proposed and compared four 
                   dimensionality reduction techniques to reduce the feature space 
                   into an input space of much lower dimension for the neural 
                   network classifier. To test the effectiveness of the proposed 
                   model, experiments were conducted using a subset of the 
                   Reuters-22173 test collection for text categorization. The 
                   results showed that the proposed model was able to achieve high 
                   categorization effectiveness as measured by precision and recall. 
                   Among the four dimensionality reduction techniques proposed, 
                   principal component analysis was found to be the most effective 
                   in reducing the dimensionality of the feature space.},
}
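%
% Lam99 reduces the raw term space to a low-dimensional input layer for a
% neural network classifier, with principal component analysis the most
% effective of the four reduction techniques tried. A hedged sketch using
% scikit-learn, with TruncatedSVD standing in for PCA on a sparse term
% matrix; the 100 components and the MLP settings are illustrative
% assumptions only.
%
%   from sklearn.feature_extraction.text import TfidfVectorizer
%   from sklearn.decomposition import TruncatedSVD
%   from sklearn.neural_network import MLPClassifier
%   from sklearn.pipeline import make_pipeline
%
%   clf = make_pipeline(
%       TfidfVectorizer(),                # raw high-dimensional term space
%       TruncatedSVD(n_components=100),   # dimensionality reduction
%       MLPClassifier(hidden_layer_sizes=(50,), max_iter=300),
%   )
%   # clf.fit(train_texts, train_labels); clf.predict(test_texts)
%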
@inProceedings{Lam97,
   author       = {Wai Lam and Kon F. Low and Chao Y. Ho},
   title        = {Using a {B}ayesian Network Induction Approach for Text 
                   Categorization},
   booktitle    = {Proceedings of IJCAI-97, 15th International Joint Conference on 
                   Artificial Intelligence},
   editor       = {Martha E. Pollack},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   year         = {1997},
   address      = {Nagoya, {JP}},
   pages        = {745--750},
   url          = {},
   abstract     = {We investigate Bayesian methods for automatic document 
                   categorization and develop a new approach to this problem. Our 
                   new approach is based on a Bayesian network induction which does 
                   not rely on some major assumptions found in a previous method 
                   using the Bayesian independence classifier approach. The design 
                   of the new approach as well as its justification are presented. 
                   Experiments were conducted using a large scale document 
                   collection from Reuters news articles. The results show that our 
                   approach outperformed the Bayesian independence classifier as 
                   measured by a metric that combines precision and recall measures.},
}
@inProceedings{Lam98,
   author       = {Wai Lam and Chao Y. Ho},
   title        = {Using a generalized instance set for automatic text 
                   categorization},
   booktitle    = {Proceedings of SIGIR-98, 21st ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {W. Bruce Croft and Alistair Moffat and Cornelis J. Van Rijsbergen 
                   and Ross Wilkinson and Justin Zobel},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1998},
   address      = {Melbourne, {AU}},
   pages        = {81--89},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/290941/p81-lam/p81-lam.pdf},
   abstract     = {We investigate several recent approaches for text categorization 
                   under the framework of similarity-based learning. They include 
                   two families of text categorization techniques, namely the 
                   k-nearest neighbor (k-NN) algorithm and linear classifiers. After 
                   identifying the weakness and strength of each technique, we 
                   propose a new technique known as the generalized instance set 
                    (GIS) algorithm by unifying the strengths of k-NN and linear 
                   classifiers and adapting to characteristics of text 
                   categorization problems. We also explore some variants of our GIS 
                   approach. We have implemented our GIS algorithm, the ExpNet 
                   algorithm, and some linear classifiers. Extensive experiments 
                   have been conducted on two common document corpora, namely the 
                   OHSUMED collection and the Reuters-21578 collection. The results 
                    show that our new approach outperforms the latest k-NN approach 
                   and linear classifiers in all experiments.},
}
@article{Lam99a,
   author       = {Lam, Wai and Ruiz, Miguel E. and Srinivasan, Padmini},
   title        = {Automatic text categorization and its applications to text 
                   retrieval},
   journal      = {{IEEE} Transactions on Knowledge and Data Engineering},
   year         = {1999},
   number       = {6},
   volume       = {11},
   pages        = {865--879},
   url          = {http://www.cs.uiowa.edu/~mruiz/papers/IEEE-TKDE.ps},
   abstract     = {We develop an automatic text categorization approach and 
                   investigate its application to text retrieval. The categorization 
                   approach is derived from a combination of a learning paradigm 
                   known as instance-based learning and an advanced document 
                   retrieval technique known as retrieval feedback. We demonstrate 
                   the effectiveness of our categorization approach using two 
                   real-world document collections from the MEDLINE database. Next, 
                   we investigate the application of automatic categorization to 
                   text retrieval. Our experiments clearly indicate that automatic 
                   categorization improves the retrieval performance compared with 
                   no categorization. We also demonstrate that the retrieval 
                   performance using automatic categorization achieves the same 
                   retrieval quality as the performance using manual categorization. 
                   Furthermore, detailed analysis of the retrieval performance on 
                   each individual test query is provided.},
}
@inProceedings{Lam01,
   author       = {Wai Lam and Kwok-Yin Lai},
   title        = {A Meta-Learning Approach for Text Categorization},
   booktitle    = {Proceedings of SIGIR-01, 24th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {W. Bruce Croft and David J. Harper and Donald H. Kraft and Justin 
                   Zobel},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {New Orleans, {US}},
   year         = {2001},
   pages        = {303--309},
   url          = {http://portal.acm.org/citation.cfm?doid=383952.384011},
   abstract     = {We investigate a meta-model approach, called Meta-learning Using 
                   Document Feature characteristics (MUDOF), for the task of 
                   automatic textual document categorization. It employs a 
                   meta-learning phase using document feature characteristics. 
                   Document feature characteristics, derived from the training 
                   document set, capture some inherent category-specific properties 
                   of a particular category. Different from existing categorization 
                   methods, MUDOF can automatically recommend a suitable algorithm 
                   for each category based on the category-specific statistical 
                   characteristics. Hence, different algorithms may be employed for 
                   different categories. Experiments have been conducted on a 
                   real-world document collection demonstrating the effectiveness of 
                   our approach. The results confirm that our meta-model approach 
                   can exploit the advantage of its component algorithms, and 
                   demonstrate a better performance than existing algorithms.},
}
@inProceedings{Lang95,
   author       = {Ken Lang},
   title        = {{\sc NewsWeeder}: learning to filter netnews},
   booktitle    = {Proceedings of ICML-95, 12th International Conference on Machine 
                   Learning},
   editor       = {Armand Prieditis and Stuart J. Russell},
   address      = {Lake Tahoe, {US}},
   pages        = {331--339},
   year         = {1995},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {},
   abstract     = {},
}
@inProceedings{Lanquillon00,
   author       = {Carsten Lanquillon},
   title        = {Learning from Labeled and Unlabeled Documents: A Comparative 
                   Study on Semi-Supervised Text Classification},
   booktitle    = {Proceedings of PKDD-00, 4th European Conference on Principles of 
                   Data Mining and Knowledge Discovery},
   editor       = {Djamel A. Zighed and Henryk Jan Komorowski and Jan M. Zytkow},
   address      = {Lyon, {FR}},
   pages        = {490--497},
   year         = {2000},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1910},
   url          = {http://link.springer.de/link/service/series/0558/papers/1910/19100490.pdf},
   abstract     = {Supervised learning algorithms usually require large amounts of 
                   training data to learn reasonably accurate classifiers. Yet, for 
                   many text classification tasks, providing labeled training 
                   documents is expensive, while unlabeled documents are readily 
                   available in large quantities. Learning from both, labeled and 
                   unlabeled documents, in a semi-supervised framework is a 
                   promising approach to reduce the need for labeled training 
                   documents. This paper compares three commonly applied text 
                   classifiers in the light of semi-supervised learning, namely a 
                   linear support vector machine, a similarity-based tfidf and a 
                    Naïve Bayes classifier. Results on real-world text datasets 
                   show that these learners may substantially benefit from using a 
                   large amount of unlabeled documents in addition to some labeled 
                   documents.},
}
@inProceedings{Larkey96,
   author       = {Leah S. Larkey and W. Bruce Croft},
   title        = {Combining classifiers in text categorization},
   booktitle    = {Proceedings of SIGIR-96, 19th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Hans-Peter Frei and Donna Harman and Peter Sch{\"{a}}uble and 
                   Ross Wilkinson},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1996},
   address      = {Z{\"{u}}rich, {CH}},
   pages        = {289--297},
   url          = {http://cobar.cs.umass.edu/pubfiles/1combo.ps.gz},
   abstract     = {Three different types of classifiers were investigated in the 
                   context of a text categorization problem in the medical domain: 
                   the automatic assignment of ICD9 codes to dictated inpatient 
                   discharge summaries. K-nearest-neighbour, relevance feedback, and 
                   Bayesian independence classifiers were applied individually and 
                   in combination. A combination of different classifiers produced 
                   better results than any single type of classifier. For this 
                   specific medical categorization problem, new query formulation 
                   and weighting methods used in the k-nearest-neighbor classifier 
                   improved performance.},
}
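%
% Larkey96 combines k-nearest-neighbour, relevance feedback and Bayesian
% independence classifiers. One simple way to realize such a combination
% is a weighted sum of per-category scores, sketched below; the equal
% default weights and the dict-based score interface are editorial
% assumptions, not the paper's exact scheme.
%
%   def combine_scores(score_dicts, weights=None):
%       # score_dicts: one {category: score} dict per component classifier
%       weights = weights or [1.0 / len(score_dicts)] * len(score_dicts)
%       combined = {}
%       for scores, w in zip(score_dicts, weights):
%           for category, s in scores.items():
%               combined[category] = combined.get(category, 0.0) + w * s
%       return sorted(combined, key=combined.get, reverse=True)
%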
@inProceedings{Larkey98,
   author       = {Leah S. Larkey},
   title        = {Automatic essay grading using text categorization techniques},
   booktitle    = {Proceedings of SIGIR-98, 21st ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {W. Bruce Croft and Alistair Moffat and Cornelis J. Van Rijsbergen 
                   and Ross Wilkinson and Justin Zobel},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1998},
   address      = {Melbourne, {AU}},
   pages        = {90--95},
   url          = {http://cobar.cs.umass.edu/pubfiles/ir-121.ps},
   abstract     = {Several standard text-categorization techniques were applied to 
                   the problem of automated essay grading. Bayesian independence 
                   classifiers and k-nearest-neighbor classifiers were trained to 
                   assign scores to manually-graded essays. These scores were 
                   combined with several other summary text measures using linear 
                   regression. The classifiers and regression equations were then 
                   applied to a new set of essays. The classifiers worked very well. 
                   The agreement between the automated grader and the final manual 
                   grade was as good as the agreement between human graders.},
}
@inProceedings{Larkey99,
   author       = {Leah S. Larkey},
   title        = {A patent search and classification system},
   booktitle    = {Proceedings of DL-99, 4th ACM Conference on Digital Libraries},
   editor       = {Edward A. Fox and Neil Rowe},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1999},
   address      = {Berkeley, {US}},
   pages        = {179--187},
   url          = {http://cobar.cs.umass.edu/pubfiles/ir-162.ps},
   abstract     = {We present a system for searching and classifying U.S. patent 
                   documents, based on Inquery. Patents are distributed through 
                   hundreds of collections, divided up by general area. The system 
                   selects the best collections for the query. Users can search for 
                    patents or classify patent text. The user interface helps users 
                   search in fields without requiring the knowledge of Inquery query 
                   operators. The system includes a unique phrase help facility, 
                   which helps users find and add phrases and terms related to those 
                   in their query.},
}
@inProceedings{Lee02,
   author       = {Yong-Bae Lee and Sung H. Myaeng},
   title        = {Text Genre Classification with Genre-Revealing and 
                   Subject-Revealing Features},
   booktitle    = {Proceedings of SIGIR-02, 25th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Micheline Beaulieu and Ricardo Baeza-Yates and Sung Hyon Myaeng 
                   and Kalervo J{\"{a}}rvelin},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Tampere, {FI}},
   year         = {2002},
   pages        = {145--150},
   url          = {http://doi.acm.org/10.1145/564376.564403},
   abstract     = {Subject or propositional content has been the focus of most 
                   classification research. Genre or style, on the other hand, is a 
                   different and important property of text, and automatic text 
                   genre classification is becoming important for classification and 
                   retrieval purposes as well as for some natural language 
                   processing research. In this paper, we present a method for 
                   automatic genre classification that is based on statistically 
                   selected features obtained from both subject-classified and genre 
                   classified training data. The experimental results show that the 
                   proposed method outperforms a direct application of a statistical 
                   learner often used for subject classification. We also observe 
                   that the deviation formula and discrimination formula using 
                   document frequency ratios also work as expected. We conjecture 
                   that this dual feature set approach can be generalized to improve 
                   the performance of subject classification as well.},
}
@inProceedings{Lee02a,
   author       = {Michael D. Lee},
   title        = {Fast Text Classification Using Sequential Sampling Processes},
   booktitle    = {Proceedings of the 14th Australian Joint Conference on Artificial 
                   Intelligence},
   editor       = {Markus Stumptner and Dan Corbett and Michael J. Brooks},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Adelaide, {AU}},
   year         = {2002},
   pages        = {309--320},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2256},
   url          = {http://link.springer.de/link/service/series/0558/papers/2256/22560309.pdf},
   abstract     = {A central problem in information retrieval is the automated 
                   classification of text documents. While many existing methods 
                   achieve good levels of performance, they generally require levels 
                   of computation that prevent them from making sufficiently fast 
                   decisions in some applied setting. Using insights gained from 
                   examining the way humans make fast decisions when classifying 
                   text documents, two new text classification algorithms are 
                   developed based on sequential sampling processes. These 
                   algorithms make extremely fast decisions, because they need to 
                   examine only a small number of words in each text document. 
                   Evaluation against the Reuters-21578 collection shows both 
                   techniques have levels of performance that approach benchmark 
                   methods, and the ability of one of the classifiers to produce 
                   realistic measures of confidence in its decisions is shown to be 
                   useful for prioritizing relevant documents.},
}
@inProceedings{Lee02c,
   author       = {Kang Hyuk Lee and Judy Kay and Byeong Ho Kang and Uwe Rosebrock},
   title        = {A Comparative Study on Statistical Machine Learning Algorithms 
                   and Thresholding Strategies for Automatic Text Categorization},
   booktitle    = {Proceedings of PRICAI-02, 7th Pacific Rim International 
                   Conference on Artificial Intelligence},
   editor       = {Mitsuru Ishizuka and Abdul Sattar},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Tokyo, {JP}},
   year         = {2002},
   pages        = {444--453},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2417},
   url          = {http://link.springer.de/link/service/series/0558/papers/2417/24170444.pdf},
   abstract     = {Two main research areas in statistical text categorization are 
                   similarity-based learning algorithms and associated thresholding 
                   strategies. The combination of these techniques significantly 
                   influences the overall performance of text categorization. After 
                   investigating two similarity-based classifiers (k-NN and Rocchio) 
                   and three common thresholding techniques (RCut, PCut, and SCut), 
                   we describe a new learning algorithm known as the keyword 
                   association network (KAN) and a new thresholding strategy 
                   (RinSCut) to improve performance over existing techniques. 
                   Extensive experiments have been conducted on the Reuters-21578 
                   and 20-Newsgroups data sets. The experimental results show that 
                   our new approaches give better results for both micro-averaged F1 
                   and macro-averaged F1 scores.},
}
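%
% Two of the thresholding strategies named in the Lee02c abstract can be
% stated in a few lines: RCut assigns the k top-scoring categories to each
% document, while SCut keeps every category whose score clears a
% per-category threshold tuned on validation data. The data structures
% below are illustrative assumptions.
%
%   def rcut(scores, k=1):
%       # scores: {category: score} for one document
%       return sorted(scores, key=scores.get, reverse=True)[:k]
%
%   def scut(scores, thresholds):
%       # thresholds: {category: threshold} learned per category
%       return [c for c, s in scores.items() if s >= thresholds[c]]
%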
@article{Lehnert94,
   author       = {Wendy Lehnert and Stephen Soderland and David Aronow and Fangfang 
                   Feng and Avinoam Shmueli},
   title        = {Inductive text classification for medical applications},
   journal      = {Journal of Experimental and Theoretical Artificial Intelligence},
   year         = {1994},
   number       = {1},
   volume       = {7},
   pages        = {49--80},
   url          = {},
   abstract     = {},
}
@article{Leopold02,
   author       = {Leopold, Edda and Kindermann, J{\"{o}}rg},
   title        = {Text Categorization with Support Vector Machines: How to 
                   Represent Texts in Input Space?},
   journal      = {Machine Learning},
   year         = {2002},
   volume       = {46},
   number       = {1/3},
   pages        = {423--444},
   url          = {http://www.wkap.nl/article.pdf?380516},
   abstract     = {The choice of the kernel function is crucial to most applications 
                   of support vector machines. In this paper, however, we show that 
                   in the case of text classification, term-frequency 
                   transformations have a larger impact on the performance of SVM 
                   than the kernel itself. We discuss the role of importance-weights 
                   (e.g. document frequency and redundancy), which is not yet fully 
                   understood in the light of model complexity and calculation cost, 
                   and we show that time consuming lemmatization or stemming can be 
                   avoided even when classifying a highly inflectional language like 
                   German.},
}
@article{Leung97,
   author       = {Chi-Hong Leung and Wing-Kay Kan},
   title        = {A Statistical Learning Approach to Automatic Indexing of 
                   Controlled Index Terms},
   journal      = {Journal of the American Society for Information Science},
   year         = {1997},
   number       = {1},
   pages        = {55--67},
   volume       = {48},
   url          = {http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=39602&PLACEBO=IE.pdf},
   abstract     = {A statistical learning approach to assigning controlled index 
                   terms is presented. In this approach, there are two processes: 
                   (1) The learning process and (2) the indexing process. The 
                   learning process constructs a relationship between an index term 
                   and the words relevant and irrelevant to it, based on the 
                   positive training set and negative training set, which are sample 
                   documents indexed by the index term, and those not indexed by it, 
                   respectively. The indexing process determines whether an index 
                   term is assigned to a certain document, based on the relationship 
                   constructed by the learning process, and the text found in the 
                   document. Furthermore, a learning feedback technique is 
                   introduced. This technique used in the learning process modifies 
                   the relationship between an index term and its relevant and 
                   irrelevant words to improve the learning performance and, thus, 
                   the indexing performance. Experimental results have shown that 
                   the statistical learning approach and the learning feedback 
                   technique are practical means to automatic indexing of controlled 
                   index terms.},
}
@inProceedings{Lewis91,
   author       = {Lewis, David D.},
   title        = {Data extraction as text categorization: An experiment with the 
                   {MUC-3} corpus.},
   booktitle    = {Proceedings of MUC-3, 3rd Message Understanding Conference},
   editor       = {},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   address      = {San Diego, {US}},
   pages        = {245--255},
   year         = {1991},
   url          = {http://www.research.att.com/~lewis/papers/lewis91c.ps},
   abstract     = {[no abstract]},
}
@inProceedings{Lewis92,
   author       = {Lewis, David D.},
   title        = {An evaluation of phrasal and clustered representations on a text 
                   categorization task},
   booktitle    = {Proceedings of SIGIR-92, 15th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Nicholas J. Belkin and Peter Ingwersen and Annelise Mark 
                   Pejtersen},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Kobenhavn, {DK}},
   pages        = {37--50},
   year         = {1992},
   url          = {http://www.research.att.com/~lewis/papers/lewis92b.ps},
   abstract     = {Syntactic phrase indexing and term clustering have been widely 
                   explored as text representation techniques for text retrieval. In 
                   this paper, we study the properties of phrasal and clustered 
                   indexing languages on a text categorization task, enabling us to 
                   study their properties in isolation from query interpretation 
                   issues. We show that optimal effectiveness occurs when using only 
                   a small proportion of the indexing terms available, and that 
                   effectiveness peaks at a higher feature set size and lower 
                   effectiveness level for a syntactic phrase indexing than for 
                   word-based indexing. We also present results suggesting that 
                   traditional term clustering methods are unlikely to provide 
                   significantly improved text representations. An improved 
                   probabilistic text categorization method is also presented.},
}
@phdThesis{Lewis92a,
   author       = {Lewis, David D.},
   title        = {Representation and learning in information retrieval},
   school       = {Department of Computer Science, University of Massachusetts},
   address      = {Amherst, {US}},
   year         = {1992},
   url          = {http://www.research.att.com/~lewis/papers/lewis91d.ps},
   abstract     = {This dissertation introduces a new theoretical model for text 
                   classification systems, including systems for document retrieval, 
                   automated indexing, electronic mail filtering, and similar tasks. 
                   The Concept Learning model emphasizes the role manual and 
                   automated feature selection and classifier formation in text 
                   classification. It enables drawing on results from statistics and 
                   machine learning in explaining the effectiveness of alternate 
                   representations of text, and specifies desirable characteristics 
                   of text representations. The use of syntactic parsing to produce 
                   indexing phrases has been widely investigated as a possible route 
                   to better text representations. Experiments with syntactic phrase 
                   indexing, however, have never yielded significant improvements in 
                   text retrieval performance. The Concept Learning model suggests 
                   that the poor statistical characteristics of a syntactic indexing 
                    phrase representation negate its desirable semantic 
                   characteristics. The application of term clustering to this 
                   representation to improve its statistical properties while 
                   retaining its desirable meaning properties is proposed. Standard 
                   term clustering strategies from information retrieval (IR), based 
                    on co-occurrence of indexing terms in documents or groups of 
                   documents, were tested on a syntactic indexing phrase 
                   representation. In experiments using a standard text retrieval 
                   test collection, small effectiveness improvements were obtained. 
                   As a means of evaluating representation quality, a text retrieval 
                   test collection introduces a number of confounding factors. In 
                   contrast, the text categorization task allows much cleaner 
                   determination of text representation properties. In preparation 
                   for the use of text categorization to study text representation, 
                    a more effective and theoretically well-founded probabilistic text 
                   categorization algorithm was developed, building on work by 
                   Maron, Fuhr, and others. Text categorization experiments 
                   supported a number of predictions of the Concept Learning model 
                   about properties of phrasal representations, including 
                   dimensionality properties not previously measured for text 
                   representations. However, in carefully controlled experiments 
                   using syntactic phrases produced by Church's stochastic 
                   bracketer, in conjunction with reciprocal nearest neighbor 
                   clustering, term clustering was found to produce essentially no 
                   improvement in the properties of the phrasal representation. New 
                   cluster analysis approaches are proposed to remedy the problems 
                   found in traditional term clustering methods.},
}
@inProceedings{Lewis94c,
   author       = {Lewis, David D. and Jason Catlett},
   title        = {Heterogeneous uncertainty sampling for supervised learning},
   booktitle    = {Proceedings of ICML-94, 11th International Conference on Machine 
                   Learning},
   editor       = {William W. Cohen and Haym Hirsh},
   year         = {1994},
   address      = {New Brunswick, {US}},
   pages        = {148--156},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www.research.att.com/~lewis/papers/lewis94e.ps},
   abstract     = {Uncertainty sampling methods iteratively request class labels for 
                   training instances whose classes are uncertain despite the 
                   previous labeled instances. These methods can greatly reduce the 
                   number of instances that an expert need label. One problem with 
                   this approach is that the classifier best suited for an 
                   application may be too expensive to train or use during the 
                   selection of instances. We test the use of one classifier (a 
                   highly efficient probabilistic one) to select examples for 
                   training another (the C4.5 rule induction program). Despite being 
                   chosen by this heterogeneous approach, the uncertainty samples 
                   yielded classifiers with lower error rates than random samples 
                   ten times larger.},
}
@inProceedings{Lewis94a,
   author       = {Lewis, David D. and Gale, William A.},
   title        = {A sequential algorithm for training text classifiers},
   booktitle    = {Proceedings of SIGIR-94, 17th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {W. Bruce Croft and Cornelis J. Van Rijsbergen},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   year         = {1994},
   address      = {Dublin, {IE}},
   pages        = {3--12},
   note         = {See also~\cite{Lewis95a}},
   url          = {http://www.research.att.com/~lewis/papers/lewis94c.ps},
   abstract     = {The ability to cheaply train text classifiers is critical to 
                   their use in information retrieval, content analysis, natural 
                   language processing, and other tasks involving data which is 
                   partly or fully textual. An algorithm for sequential sampling 
                   during machine learning of statistical classifiers was developed 
                   and tested on a newswire text categorization task. This method, 
                   which we call uncertainty sampling, reduced by as much as 
                   500-fold the amount of training data that would have to be 
                   manually classified to achieve a given level of effectiveness.},
}
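%
% The uncertainty sampling procedure described in Lewis94a repeatedly asks
% a human to label the unlabelled documents whose predicted probability of
% class membership is closest to 0.5. A sketch of one sampling round,
% assuming a scikit-learn-style binary classifier with predict_proba() and
% an oracle() labelling function (both assumptions of this sketch):
%
%   def uncertainty_sampling_round(clf, labelled, unlabelled, oracle,
%                                  batch=10):
%       texts, labels = zip(*labelled)
%       clf.fit(list(texts), list(labels))
%       probs = clf.predict_proba(list(unlabelled))[:, 1]
%       # Most uncertain first: probability nearest to 0.5.
%       ranked = sorted(zip(list(unlabelled), probs),
%                       key=lambda tp: abs(tp[1] - 0.5))
%       for text, _ in ranked[:batch]:
%           labelled.append((text, oracle(text)))
%           unlabelled.remove(text)
%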
@article{Lewis94b,
   author       = {Lewis, David D. and Philip J. Hayes},
   title        = {Guest editors' introduction to the special issue on text 
                   categorization},
   journal      = {{ACM} Transactions on Information Systems},
   volume       = {12},
   number       = {3},
   pages        = {231},
   year         = {1994},
}
@inProceedings{Lewis94,
   author       = {Lewis, David D. and Marc Ringuette},
   title        = {A comparison of two learning algorithms for text categorization},
   booktitle    = {Proceedings of SDAIR-94, 3rd Annual Symposium on Document 
                   Analysis and Information Retrieval},
   publisher    = {},
   editor       = {},
   year         = {1994},
   address      = {Las Vegas, {US}},
   pages        = {81--93},
   url          = {http://www.research.att.com/~lewis/papers/lewis94b.ps},
   abstract     = {This paper examines the use of inductive learning to categorize 
                   natural language documents into predefined content categories. 
                   Categorization of text is of increasing importance in information 
                   retrieval and natural language processing systems. Previous 
                   research on automated text categorization has mixed machine 
                   learning and knowledge engineering methods, making it difficult 
                   to draw conclusions about the performance of particular methods. 
                   In this paper we present empirical results on the performance of 
                   a Bayesian classifier and a decision tree learning algorithm on 
                   two text categorization data sets. We find that both algorithms 
                   achieve reasonable performance and allow controlled tradeoffs 
                   between false positives and false negatives. The stepwise feature 
                   selection in the decision tree algorithm is particularly 
                   effective in dealing with the large feature sets common in text 
                   categorization. However, even this algorithm is aided by an 
                   initial prefiltering of features, confirming the results found by 
                   Almuallim and Dietterich on artificial data sets. We also 
                   demonstrate the impact of the time-varying nature of category 
                   definitions.},
}
@inProceedings{Lewis95,
   author       = {Lewis, David D.},
   title        = {Evaluating and optimizing autonomous text classification systems},
   booktitle    = {Proceedings of SIGIR-95, 18th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Edward A. Fox and Peter Ingwersen and Raya Fidel},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1995},
   address      = {Seattle, {US}},
   pages        = {246--254},
   url          = {http://www.research.att.com/~lewis/papers/lewis95b.ps},
   abstract     = {Text retrieval systems typically produce a ranking of documents 
                   and let a user decide how far down that ranking to go. In 
                   contrast, programs that filter text streams, software that 
                   categorizes documents, agents which alert users, and many other 
                   IR systems must make decisions without human input or 
                   supervision. It is important to define what constitutes good 
                   effectiveness for these autonomous systems, tune the systems to 
                   achieve the highest possible effectiveness, and estimate how the 
                   effectiveness changes as new data is processed. We show how to do 
                   this for binary text classification systems, emphasizing that 
                   different goals for the system lead to different optimal 
                   behaviors. Optimizing and estimating effectiveness is greatly 
                   aided if classifiers that explicitly estimate the probability of 
                   class membership are used.},
}
@article{Lewis95a,
   author       = {Lewis, David D.},
   title        = {A sequential algorithm for training text classifiers: corrigendum 
                   and additional data},
   journal      = {{SIGIR} Forum},
   year         = {1995},
   pages        = {13--19},
   volume       = {29},
   number       = {2},
   url          = {http://www.research.att.com/~lewis/papers/lewis95g.ps},
   abstract     = {Previously I compared the effectiveness of uncertainty sampling 
                   with that of random sampling and relevance sampling in choosing 
                   training data for a text categorization data set (Lewis and Gale, 
                   1994). (Relevance sampling is the application of relevance 
                   feedback to producing a training sample.) I have discovered a bug 
                   in my experimental software which caused the relevance sampling 
                   results reported in the paper to be incorrect. (The uncertainty 
                   sampling and random sampling results in that paper were correct.) 
                   I have since fixed the bug and rerun the experiments. This note 
                   presents the corrected results, along with additional data 
                   supporting the original claim that uncertainty sampling has an 
                   advantage over relevance sampling in most training situations.},
}
@inProceedings{Lewis95b,
   author       = {David D. Lewis},
   title        = {The {TREC-4} filtering track: description and analysis},
   booktitle    = {Proceedings of TREC-4, 4th Text Retrieval Conference},
   publisher    = {National Institute of Standards and Technology, Gaithersburg, {US}},
   editor       = {Donna K. Harman and Ellen M. Voorhees},
   year         = {1995},
   address      = {Gaithersburg, {US}},
   pages        = {165--180},
   url          = {http://www.research.att.com/~lewis/papers/lewis96b.ps},
   abstract     = {The TREC-4 (4th Text REtrieval Conference) filtering track was an 
                   experiment in the evaluation of binary text classification 
                   systems. In contrast to ranking systems, binary text 
                   classification systems may need to produce result sets of any 
                   size, requiring that sampling be used to estimate their 
                   effectiveness. We present an effectiveness measure based on 
                   utility, and two sampling strategies (pooling and stratified 
                   sampling) for estimating the utility of the submitted sets. An 
                   evaluation of four sites was successfully carried out using this 
                   approach.},
}
@inProceedings{Lewis96,
   author       = {Lewis, David D. and Robert E. Schapire and James P. Callan and 
                   Ron Papka},
   title        = {Training algorithms for linear text classifiers},
   booktitle    = {Proceedings of SIGIR-96, 19th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Hans-Peter Frei and Donna Harman and Peter Sch{\"{a}}uble and 
                   Ross Wilkinson},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1996},
   address      = {Z{\"{u}}rich, {CH}},
   pages        = {298--306},
   url          = {http://www.research.att.com/~lewis/papers/lewis96d.ps},
   abstract     = {Systems for text retrieval, routing, categorization and other IR 
                   tasks rely heavily on linear classifiers. We propose that two 
                   machine learning algorithms, the Widrow-Hoff and EG algorithms, 
                   be used in training linear text classifiers. In contrast to most 
                   IR methods, theoretical analysis provides performance guarantees 
                   and guidance on parameter settings for these algorithms. 
                   Experimental data is presented showing Widrow-Hoff and EG to be 
                   more effective than the widely used Rocchio algorithm on several 
                   categorization and routing tasks.},
}
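%
% Lewis96 trains linear classifiers with the Widrow-Hoff and EG updates.
% The Widrow-Hoff (least-mean-squares) rule moves the weight vector
% against the squared-error gradient on each example,
% w <- w - 2*eta*(w.x - y)*x. A plain-Python sketch with an illustrative
% learning rate and dense vectors (both editorial assumptions):
%
%   def widrow_hoff(examples, dim, eta=0.01, epochs=5):
%       # examples: iterable of (x, y) with x a list of floats, y in {0,1}
%       w = [0.0] * dim
%       for _ in range(epochs):
%           for x, y in examples:
%               pred = sum(wi * xi for wi, xi in zip(w, x))
%               w = [wi - 2.0 * eta * (pred - y) * xi
%                    for wi, xi in zip(w, x)]
%       return w
%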
@misc{Lewis97a,
   author       = {Lewis, David D.},
   title        = {Reuters-21578 text categorization test collection. {D}istribution 
                   1.0},
   year         = {1997},
   note         = {Available as {\tt 
                   http://www.research.att.com/\~{}lewis/reuters21578/README.txt}},
   url          = {http://www.research.att.com/~lewis/reuters21578/README.txt},
   abstract     = {[no abstract]},
}
@inProceedings{Lewis98,
   author       = {Lewis, David D.},
   title        = {Naive ({B}ayes) at forty: The independence assumption in 
                   information retrieval.},
   booktitle    = {Proceedings of ECML-98, 10th European Conference on Machine 
                   Learning},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1398},
   editor       = {Claire N{\'{e}}dellec and C{\'{e}}line Rouveirol},
   address      = {Chemnitz, {DE}},
   pages        = {4--15},
   year         = {1998},
   url          = {http://www.research.att.com/~lewis/papers/lewis98b.ps},
   abstract     = {The naive Bayes classifier, currently experiencing a renaissance 
                   in machine learning, has long been a core technique in 
                   information retrieval. We review some of the variations of naive 
                   Bayes models used for text retrieval and classification, focusing 
                   on the distributional assumptions made about word occurrences in 
                   documents.},
}
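%
% One of the naive Bayes variants reviewed in Lewis98 above, the multinomial
% model with Laplace smoothing, can be sketched in a few lines of Python.
% This is a generic illustration, not code from the paper; unseen words are
% simply ignored at classification time.
%
%    import math
%    from collections import Counter
%
%    def train_multinomial_nb(docs, labels):
%        """docs: list of token lists; labels: parallel list of class names."""
%        classes, vocab = set(labels), {w for d in docs for w in d}
%        prior, cond = {}, {}
%        for c in classes:
%            class_docs = [d for d, l in zip(docs, labels) if l == c]
%            prior[c] = math.log(len(class_docs) / len(docs))
%            counts = Counter(w for d in class_docs for w in d)
%            total = sum(counts.values())
%            cond[c] = {w: math.log((counts[w] + 1) / (total + len(vocab)))
%                       for w in vocab}
%        return prior, cond
%
%    def classify(doc, prior, cond):
%        """Return the class with the highest log posterior."""
%        return max(prior, key=lambda c: prior[c] +
%                   sum(cond[c].get(w, 0.0) for w in doc))
%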
@inProceedings{Lewis99,
   author       = {Lewis, David D. and Daniel L. Stern and Amit Singhal},
   title        = {{\sc Attics}: a software platform for on-line text classification},
   booktitle    = {Proceedings of SIGIR-99, 22nd ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Marti A. Hearst and Fredric Gey and Richard Tong},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Berkeley, {US}},
   year         = {1999},
   pages        = {267--268},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/312624/p267-lewis/p267-lewis.pdf},
   abstract     = {Numerous systems for ranked retrieval on text databases have been 
                   implemented by both information retrieval researchers and in the 
                   commercial sector. In contrast, software for text categorization, 
                   message filtering, textual data mining, and related tasks is less 
                   common. ATTICS is an extensible text classification system we 
                   have implemented in C++. It supports incremental training and 
                   online application of classifiers and predictive models to 
                   streams of textual, numeric, symbolic, and hybrid data records. 
                   An object-oriented design allows easy addition of new 
                   preprocessors, machine learning algorithms, and classifier types.},
}
@inProceedings{Lewis00,
   author       = {Lewis, David D.},
   title        = {Machine learning for text categorization: background and 
                   characteristics},
   booktitle    = {Proceedings of the 21st Annual National Online Meeting},
   editor       = {Williams, Martha E.},
   publisher    = {Information Today, Medford, {US}},
   address      = {New York, {US}},
   year         = {2000},
   pages        = {221--226},
   url          = {},
   abstract     = {Text categorization is of increasing interest in both controlled 
                   vocabulary indexing and other applications. Machine learning 
                   methods for automatically producing categorization rules have 
                   similarly seen increased attention, as a way to reduce the cost 
                   of fielding categorization systems. While the experimental 
                   literature on text categorization emphasizes effectiveness 
                   comparisons, we list a variety of other characteristics of 
                   learning approaches that are equally important to consider. 
                   Research on machine learning for text categorization, already 
                   advancing at a rapid pace, could be further accelerated if better 
                   test collections were available.},
}
@article{Lewis03,
   author       = {Lewis, David D. and Fan Li and Tony Rose and Yiming Yang},
   title        = {{Reuters Corpus Volume I} as a text categorization test 
                   collection},
   journal      = {Journal of Machine Learning Research},
   volume       = {},
   month        = {},
   pages        = {},
   year         = {2003},
   url          = {},
   abstract     = {},
   note         = {Forthcoming},
}
@inProceedings{Li97,
   author       = {Hang Li and Kenji Yamanishi},
   title        = {Document classification using a finite mixture model},
   booktitle    = {Proceedings of ACL-97, 35th Annual Meeting of the Association for 
                   Computational Linguistics},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   editor       = {Philip R. Cohen and Wolfgang Wahlster},
   year         = {1997},
   address      = {Madrid, {ES}},
   pages        = {39--47},
   url          = {http://xxx.lanl.gov/ps/cmp-lg/9705005},
   abstract     = {We propose a new method of classifying documents into categories. 
                   The simple method of conducting hypothesis testing over 
                   word-based distributions in categories suffers from the data 
                   sparseness problem. In order to address this difficulty, Guthrie 
                    et al. have developed a method using distributions based on hard 
                   clustering of words, i.e., in which a word is assigned to a 
                   single cluster and words in the same cluster are treated 
                   uniformly. This method might, however, degrade classification 
                   results, since the distributions it employs are not always 
                   precise enough for representing the differences between 
                   categories. We propose here the use of soft clustering of words, 
                   i.e., in which a word can be assigned to several different 
                   clusters and each cluster is characterized by a specific word 
                   probability distribution. We define for each document category a 
                   finite mixture model, which is a linear combination of the 
                   probability distributions of the clusters. We thereby treat the 
                   problem of classifying documents as that of conducting 
                   statistical hypothesis testing over finite mixture models. In 
                   order to accomplish this testing, we employ the EM algorithm 
                   which helps efficiently estimate parameters in a finite mixture 
                   model. Experimental results indicate that our method outperforms 
                   not only the method using distributions based on hard clustering, 
                   but also the method using word-based distributions and the method 
                   based on cosine-similarity.},
}
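%
% The classification step described in Li97 above reduces, once the cluster
% word distributions and mixture weights have been estimated (by EM in the
% paper), to comparing per-category mixture likelihoods. A minimal sketch,
% with illustrative names and a small floor probability for unseen words
% added as an assumption:
%
%    import math
%
%    def mixture_log_likelihood(tokens, weights, cluster_word_probs):
%        """weights: list of P(cluster|category);
%        cluster_word_probs: list of dicts P(word|cluster)."""
%        ll = 0.0
%        for w in tokens:
%            p = sum(wt * probs.get(w, 1e-9)
%                    for wt, probs in zip(weights, cluster_word_probs))
%            ll += math.log(p)
%        return ll
%
%    def most_likely_category(tokens, category_models):
%        """category_models: {category: (weights, cluster_word_probs)}."""
%        return max(category_models, key=lambda c:
%                   mixture_log_likelihood(tokens, *category_models[c]))
%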
@inProceedings{Li99,
   author       = {Hang Li and Kenji Yamanishi},
   title        = {Text classification using {ESC}-based stochastic decision lists},
   booktitle    = {Proceedings of CIKM-99, 8th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {},
   year         = {1999},
   address      = {Kansas City, {US}},
   pages        = {122--130},
   url          = {http://www.acm.org/pubs/articles/proceedings/cikm/319950/p122-li/p122-li.pdf},
   abstract     = {We propose a new method of text classification using stochastic 
                   decision lists. A stochastic decision list is an ordered sequence 
                   of IF-THEN rules, and our method can be viewed as a rule-based 
                    method for text classification having advantages of readability 
                   and refinability of acquired knowledge. Our method is unique in 
                   that decision lists are automatically constructed on the basis of 
                   the principle of minimizing Extended Stochastic Complexity (ESC), 
                   and with it we are able to construct decision lists that have 
                   fewer errors in classification. The accuracy of classification 
                   achieved with our method appears better than or comparable to 
                    that of existing rule-based methods.},
}
@article{Li02,
   author       = {Hang Li and Kenji Yamanishi},
   title        = {Text classification using {ESC}-based stochastic decision lists},
   journal      = {Information Processing and Management},
   pages        = {343--361},
   year         = {2002},
   number       = {3},
   volume       = {38},
   url          = {},
   abstract     = {We propose a new method of text classification using stochastic 
                   decision lists. A stochastic decision list is an ordered sequence 
                   of IF-THEN-ELSE rules, and our method can be viewed as a 
                   rule-based method for text classification having advantages of 
                   readability and refinability of acquired knowledge. Our method is 
                   unique in that decision lists are automatically constructed on 
                   the basis of the principle of minimizing extended stochastic 
                   complexity (ESC), and with it we are able to construct decision 
                   lists that have fewer errors in classification. The accuracy of 
                   classification achieved with our method appears better than or 
                    comparable to that of existing rule-based methods. We have 
                   empirically demonstrated that rule-based methods like ours result 
                   in high classification accuracy when the categories to which 
                   texts are to be assigned are relatively specific ones and when 
                   the texts tend to be short. We have also empirically verified the 
                   advantages of rule-based methods over non-rule-based ones.},
}
@inProceedings{Li02a,
   author       = {Xin Li and Dan Roth},
   title        = {Learning question classifiers},
   booktitle    = {Proceedings of COLING-02, 19th International Conference on 
                   Computational Linguistics},
   editor       = {},
   publisher    = {},
   address      = {Taipei, {TW}},
   url          = {http://l2r.cs.uiuc.edu/~danr/Papers/qc-coling02.pdf},
   year         = {2002},
   abstract     = {In order to respond correctly to a free form factual question 
                   given a large collection of texts, one needs to understand the 
                   question to a level that allows determining some of the 
                   constraints the question imposes on a possible answer. These 
                   constraints may include a semantic classification of the sought 
                   after answer and may even suggest using different strategies when 
                   looking for and verifying a candidate answer. This paper presents 
                   a machine learning approach to question classification. We learn 
                   a hierarchical classi- fier that is guided by a layered semantic 
                   hierarchy of answer types, and eventually classifies questions 
                   into finegrained classes. We show accurate results on a large 
                   collection of free-form questions used in TREC 10.},
}
@inProceedings{Li91,
   author       = {Wei Li and B. Lee and F. Krausz and K. Sahin},
   title        = {Text classification by a neural network},
   booktitle    = {Proceedings of the 23rd Annual Summer Computer Simulation 
                   Conference},
   editor       = {},
   publisher    = {},
   address      = {Baltimore, {US}},
   pages        = {313--318},
   year         = {1991},
   url          = {},
   abstract     = {When banks process their free-form telex traffic, the first task 
                   is the classification of the telexes. Historically, several 
                   attempts have been made to automate this process, using various 
                   stock phrases as the features on which to base the 
                   classification. This is a problem in which there are large 
                   amounts of data available, but the rules for classification are 
                   not explicitly available. For solving these kinds of problems, 
                   neural networks have the advantage of extracting the underlying 
                   relationships between the input data and the output classes 
                   automatically. Based on this consideration, the authors have 
                   built a neural network classification system, which has three 
                   subsystems: a user-maintainable feature definition subsystem, a 
                   feature extraction subsystem, and a neural network subsystem. The 
                   neural network is simulated on a VAX computer with a fast 
                   learning algorithm, and is combined with some non-statistical 
                   knowledge from the feature definition system. Above 90\% correct 
                   recognition rates have been achieved for the major categories 
                   concerned. The system is also applicable to text classification 
                   problems other than telex classification.},
}
@article{Li98a,
   author       = {Li, Yong H. and Jain, Anil K.},
   title        = {Classification of text documents},
   journal      = {The Computer Journal},
   year         = {1998},
   volume       = {41},
   number       = {8},
   pages        = {537--546},
   url          = {},
   abstract     = {The exponential growth of the Internet has led to a great deal of 
                   interest in developing useful and efficient tools and software to 
                   assist users in searching the Web. Document retrieval, 
                   categorization, routing and filtering can all be formulated as 
                   classification problems. However, the complexity of natural 
                   languages and the extremely high dimensionality of the feature 
                   space of documents have made this classification problem very 
                   difficult. We investigate four different methods for document 
                   classification: the naive Bayes classifier, the nearest neighbour 
                   classifier, decision trees and a subspace method. These were 
                   applied to seven-class Yahoo news groups (business, 
                   entertainment, health, international, politics, sports and 
                    technology) individually and in combination. We studied three 
                   classifier combination approaches: simple voting, dynamic 
                   classifier selection and adaptive classifier combination. Our 
                   experimental results indicate that the naive Bayes classifier and 
                   the subspace method outperform the other two classifiers on our 
                   data sets. Combinations of multiple classifiers did not always 
                   improve the classification accuracy compared to the best 
                   individual classifier. Among the three different combination 
                   approaches, our adaptive classifier combination method introduced 
                   here performed the best.},
}
@inProceedings{Li03,
   author       = {Cong Li and Ji-Rong Wen and Hang Li},
   title        = {Text Classification Using Stochastic Keyword Generation},
   booktitle    = {Proceedings of ICML-03, 20th International Conference on Machine 
                   Learning},
   editor       = {},
   year         = {2003},
   address      = {Washington, {DC}},
   pages        = {},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {},
   abstract     = {},
}
@inProceedings{Li03a,
   author       = {Fan Li and Yiming Yang},
   title        = {A Loss Function Analysis for Classification Methods in Text 
                   Categorization},
   booktitle    = {Proceedings of ICML-03, 20th International Conference on Machine 
                   Learning},
   editor       = {},
   year         = {2003},
   address      = {Washington, {DC}},
   pages        = {},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {},
   abstract     = {},
}
@inProceedings{Liao02,
   author       = {Yihua Liao and V. Rao Vemuri},
   title        = {Using Text Categorization Techniques for Intrusion Detection},
   booktitle    = {Proceedings of the 11th USENIX Security Symposium},
   publisher    = {},
   editor       = {Dan Boneh},
   year         = {2002},
   address      = {San Francisco, {US}},
   pages        = {51--59},
   url          = {http://www.usenix.org/publications/library/proceedings/sec02/liao.html},
   abstract     = {A new approach, based on the k-Nearest Neighbor (kNN) classifier, 
                   is used to classify program behavior as normal or intrusive. 
                   Short sequences of system calls have been used by others to 
                   characterize a program's normal behavior before. However, 
                   separate databases of short system call sequences have to be 
                   built for different programs, and learning program profiles 
                   involves time-consuming training and testing processes. With the 
                   kNN classifier, the frequencies of system calls are used to 
                   describe the program behavior. Text categorization techniques are 
                   adopted to convert each process to a vector and calculate the 
                   similarity between two program activities. Since there is no need 
                   to learn individual program profiles separately, the calculation 
                   involved is largely reduced. Preliminary experiments with 1998 
                   DARPA BSM audit data show that the kNN classifier can effectively 
                   detect intrusive attacks and achieve a low false positive rate.},
}
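%
% The kNN step described in Liao02 above treats each process as a vector of
% system-call frequencies and scores it against labelled training processes
% by cosine similarity. The sketch below is an illustration under those
% assumptions; the value of k and the raw-count weighting are placeholders,
% not details taken from the paper.
%
%    import math
%    from collections import Counter
%
%    def cosine(u, v):
%        """Cosine similarity between two sparse count vectors (dicts)."""
%        dot = sum(u[k] * v.get(k, 0) for k in u)
%        nu = math.sqrt(sum(x * x for x in u.values()))
%        nv = math.sqrt(sum(x * x for x in v.values()))
%        return dot / (nu * nv) if nu and nv else 0.0
%
%    def knn_label(call_trace, training_set, k=5):
%        """call_trace: list of system-call names for one process;
%        training_set: list of (call_trace, label) pairs."""
%        query = Counter(call_trace)
%        ranked = sorted(training_set, reverse=True,
%                        key=lambda item: cosine(query, Counter(item[0])))
%        top = [label for _, label in ranked[:k]]
%        return max(set(top), key=top.count)
%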
@article{Liddy94,
   author       = {Elizabeth D. Liddy and Woojin Paik and Edmund S. Yu},
   title        = {Text categorization for multiple users based on semantic features 
                   from a machine-readable dictionary},
   journal      = {{ACM} Transactions on Information Systems},
   year         = {1994},
   number       = {3},
   volume       = {12},
   pages        = {278--295},
   url          = {http://www.acm.org/pubs/articles/journals/tois/1994-12-3/p278-liddy/p278-liddy.pdf},
   abstract     = {The text categorization module described in the paper provides a 
                   front-end filtering function for the larger DR-LINK text 
                   retrieval system (Liddy and Myaeng 1993). The module evaluates a 
                   large incoming stream of documents to determine which documents 
                   are sufficiently similar to a profile at the broad subject level 
                   to warrant more refined representation and matching. To 
                   accomplish this task, each substantive word in a text is first 
                   categorized using a feature set based on the semantic subject 
                   field codes (SFCs) assigned to individual word senses in a 
                   machine-readable dictionary. When tested on 50 user profiles and 
                   550 megabytes of documents, results indicate that the feature set 
                   that is the basis of the text categorization module and the 
                   algorithm that establishes the boundary of categories of 
                   potentially relevant documents accomplish their tasks with a high 
                   level of performance. This means that the category of potentially 
                   relevant documents for most profiles would contain at least 80\% 
                   of all documents later determined to be relevant to the profile. 
                   The number of documents in this set would be uniquely determined 
                   by the system's category-boundary predictor, and this set is 
                   likely to contain less than 5\% of the incoming stream of 
                   documents.},
}
@inProceedings{Liere97,
   author       = {Ray Liere and Prasad Tadepalli},
   title        = {Active learning with committees for text categorization},
   booktitle    = {Proceedings of AAAI-97, 14th Conference of the American 
                   Association for Artificial Intelligence},
   editor       = {},
   publisher    = {{AAAI} Press, Menlo Park, {US}},
   year         = {1997},
   pages        = {591--596},
   address      = {Providence, {US}},
   url          = {http://www.rdrop.com/~lierer/aaai97.ps},
   abstract     = {In many real-world domains, supervised learning requires a large 
                   number of training examples. In this paper, we describe an active 
                   learning method that uses a committee of learners to reduce the 
                   number of training examples required for learning. Our approach 
                   is similar to the Query by Committee framework, where 
                   disagreement among the committee members on the predicted label 
                   for the input part of the example is used to signal the need for 
                   knowing the actual value of the label. Our experiments are 
                   conducted in the text categorization domain, which is 
                   characterized by a large number of features, many of which are 
                   irrelevant. We report here on experiments using a committee of 
                   Winnow-based learners and demonstrate that this approach can 
                   reduce the number of labeled training examples required over that 
                   used by a single Winnow learner by 1-2 orders of magnitude.},
}
@inProceedings{Liere98,
   author       = {Ray Liere and Prasad Tadepalli},
   title        = {Active Learning with Committees: Preliminary Results in Comparing 
                   {W}innow and {P}erceptron in Text Categorization},
   booktitle    = {Proceedings of CONALD-98, 1st Conference on Automated Learning 
                   and Discovery},
   editor       = {},
   publisher    = {{AAAI} Press, Menlo Park, {US}},
   year         = {1998},
   pages        = {},
   address      = {Pittsburgh, {US}},
   url          = {http://www.rdrop.com/~lierer/conald98.ps},
   abstract     = {The availability of vast amounts of information on the World Wide 
                   Web has created a big demand for automatic tools to organize and 
                   index that information. Unfortunately, the paradigm of supervised 
                   machine learning is ill-suited to this task, as it assumes that 
                    the training examples are classified by a teacher, usually a 
                   human. In this paper, we describe an active learning method based 
                   on Query by Committee (QBC) that reduces the number of labeled 
                   training examples (text documents) required for learning by 1-2 
                   orders of magnitude.},
}
@inProceedings{Lim99,
   author       = {Lim, Joo Hwee},
   title        = {Learnable visual keywords for image classification},
   booktitle    = {Proceedings of DL-99, 4th ACM Conference on Digital Libraries},
   editor       = {Edward A. Fox and Neil Rowe},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1999},
   address      = {Berkeley, {US}},
   pages        = {139--145},
   url          = {http://www.acm.org/pubs/articles/proceedings/dl/313238/p139-lim/p139-lim.pdf},
   abstract     = {Automatic categorization of multimedia documents is an important 
                   function for a digital library system. While text categorization 
                    has received much attention from IR researchers, classification of 
                    visual data is still in its infancy. In this paper, we propose a 
                   notion of visual keywords for similarity matching between visual 
                   contents. Visual keywords can be constructed automatically from 
                   samples of visual data through supervised/unsupervised learning. 
                   Given a visual content, the occurrences of visual keywords are 
                   detected, summarized spatially, and coded via singular value 
                   decomposition to arrive at a concise coded description. The 
                   methods to create, detect, summarize, select, and code visual 
                   keywords will be detailed. Last but not least, we describe an 
                   evaluation experiment that classifies professional nature scenery 
                   photographs to demonstrate the effectiveness and efficiency of 
                   visual keywords for automatic categorization of images in digital 
                   libraries.},
}
@inProceedings{Liu02,
   author       = {Yan Liu and Yiming Yang and Jaime Carbonell},
   title        = {Boosting to Correct the Inductive Bias for Text Classification},
   booktitle    = {Proceedings of CIKM-02, 11th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {},
   year         = {2002},
   address      = {McLean, {US}},
   pages        = {348--355},
   url          = {http://doi.acm.org/10.1145/584792.584850},
   abstract     = {This paper studies the effects of boosting in the context of 
                   different classification methods for text categorization, 
                   including Decision Trees, Naive Bayes, Support Vector Machines 
                   (SVMs) and a Rocchio-style classifier. We identify the inductive 
                   biases of each classifier and explore how boosting, as an 
                   error-driven resampling mechanism, reacts to those biases. Our 
                   experiments on the Reuters-21578 benchmark show that boosting is 
                   not effective in improving the performance of the base 
                   classifiers on common categories. However, the effect of boosting 
                   for rare categories varies across classifiers: for SVMs and 
                   Decision Trees, we achieved a 13-17\% performance improvement in 
                   macro-averaged F1 measure, but did not obtain substantial 
                   improvement for the other two classifiers. This interesting 
                   finding of boosting on rare categories has not been reported 
                   before.},
}
@article{Lodhi02,
   author       = {Huma Lodhi and Craig Saunders and John Shawe-Taylor and Nello 
                   Cristianini and Chris Watkins},
   title        = {Text Classification using String Kernels},
   journal      = {Journal of Machine Learning Research},
   volume       = {2},
   pages        = {419--444},
   year         = {2002},
   url          = {http://www.ai.mit.edu/projects/jmlr/papers/volume2/lodhi02a/lodhi02a.pdf},
   abstract     = {We propose a novel approach for categorizing text documents based 
                   on the use of a special kernel. The kernel is an inner product in 
                   the feature space generated by all subsequences of length k. A 
                   subsequence is any ordered sequence of k characters occurring in 
                   the text though not necessarily contiguously. The subsequences 
                   are weighted by an exponentially decaying factor of their full 
                   length in the text, hence emphasising those occurrences that are 
                   close to contiguous. A direct computation of this feature vector 
                   would involve a prohibitive amount of computation even for modest 
                   values of k, since the dimension of the feature space grows 
                   exponentially with k. The paper describes how despite this fact 
                   the inner product can be efficiently evaluated by a dynamic 
                   programming technique. Experimental comparisons of the 
                   performance of the kernel compared with a standard word feature 
                   space kernel (Joachims, 1998) show positive results on modestly 
                   sized datasets. The case of contiguous subsequences is also 
                   considered for comparison with the subsequences kernel with 
                   different decay factors. For larger documents and datasets the 
                   paper introduces an approximation technique that is shown to 
                   deliver good approximations efficiently for large datasets.},
}
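%
% The gap-weighted subsequence kernel of Lodhi02 above can be written down
% directly from its recursive definition. The sketch below follows that
% recursion naively, without the dynamic-programming tables that make the
% paper's evaluation efficient, and is meant only as an illustration; in the
% paper the kernel is normalised before use.
%
%    def k_prime(s, t, i, lam):
%        """Auxiliary kernel K'_i from the recursive definition."""
%        if i == 0:
%            return 1.0
%        if min(len(s), len(t)) < i:
%            return 0.0
%        x, rest = s[-1], s[:-1]
%        total = lam * k_prime(rest, t, i, lam)
%        for j, ch in enumerate(t):
%            if ch == x:
%                total += k_prime(rest, t[:j], i - 1, lam) * lam ** (len(t) - j + 1)
%        return total
%
%    def ssk(s, t, n, lam=0.5):
%        """Order-n subsequence kernel with decay factor lam (unnormalised)."""
%        if min(len(s), len(t)) < n:
%            return 0.0
%        x, rest = s[-1], s[:-1]
%        total = ssk(rest, t, n, lam)
%        for j, ch in enumerate(t):
%            if ch == x:
%                total += k_prime(rest, t[:j], n - 1, lam) * lam ** 2
%        return total
%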
@inProceedings{Macskassy01,
   author       = {Sofus A. Macskassy and Haym Hirsh and Arunava Banerjee and Aynur 
                   A. Dayanik},
   title        = {Using Text Classifiers for Numerical Classification},
   booktitle    = {Proceeding of IJCAI-01, 17th International Joint Conference on 
                   Artificial Intelligence},
   editor       = {Bernhard Nebel},
   address      = {Seattle, {US}},
   year         = {2001},
   pages        = {885--890},
   url          = {http://www.cs.rutgers.edu/~sofmac/paper/ijcai2001/macskassy-ijcai2001.pdf},
   abstract     = {Consider a supervised learning problem in which examples contain 
                   both numerical- and text-valued features. To use traditional 
                    feature-vector-based learning methods, one could treat the 
                    presence or absence of a word as a Boolean feature and use these 
                    binary-valued features together with the numerical features. 
                    However, the use of a text-classification system on this is a bit 
                    more problematic: in the most straightforward approach each 
                    number would be considered a distinct token and treated as a 
                    word. This paper presents an alternative approach for the use of 
                    text classification methods for supervised learning problems 
                   with numerical-valued features in which the numerical features 
                   are converted into bag-of-words features, thereby making them 
                   directly usable by text classification methods. We show that even 
                   on purely numerical-valued data the results of 
                   text-classification on the derived text-like representation 
                   outperforms the more naive numbers-as-tokens representation and, 
                   more importantly, is competitive with mature numerical 
                   classification methods such as C4.5 and Ripper.},
}
@article{Maderlechner97,
   author       = {Maderlechner, G. and Suda, P. and Bruckner, T.},
   title        = {Classification of documents by form and content},
   journal      = {Pattern Recognition Letters},
   pages        = {1225--1231},
   year         = {1997},
   volume       = {18},
   number       = {11/13},
   url          = {},
   abstract     = {This paper presents a modular software system, which classifies a 
                   large variety of office documents according to layout form and 
                   textual content. It consists of the following components: layout 
                   analysis, pre-classification, OCR interface, fuzzy string 
                   matching, text categorization, lexical, syntactical and semantic 
                   analysis. The system has been applied to the following tasks: 
                   presorting of forms, reports and letters, index extraction for 
                   archiving and retrieval, page type classification and text column 
                   analysis of real estate register documents, in-house mail sorting 
                   and electronic distribution to departments. The architecture, 
                   modules, and practical results are described.},
}
@article{Manevitz01,
   author       = {Larry M. Manevitz and Malik Yousef},
   title        = {One-Class {SVMs} for Document Classification},
   journal      = {Journal of Machine Learning Research},
   volume       = {2},
   month        = {December},
   pages        = {139--154},
   year         = {2001},
   url          = {http://www.ai.mit.edu/projects/jmlr/papers/volume2/manevitz01a/manevitz01a.pdf},
   abstract     = {We implemented versions of the SVM appropriate for one-class 
                   classification in the context of information retrieval. The 
                   experiments were conducted on the standard Reuters data set. For 
                   the SVM implementation we used both a version of Schoelkopf et 
                   al. and a somewhat different version of one-class SVM based on 
                   identifying ``outlier" data as representative of the 
                   second-class. We report on experiments with different kernels for 
                   both of these implementations and with different representations 
                   of the data, including binary vectors, tf-idf representation and 
                    a modification called ``Hadamard'' representation. Then we 
                   compared it with one-class versions of the algorithms prototype 
                   (Rocchio), nearest neighbor, naive Bayes, and finally a natural 
                   one-class neural network classification method based on 
                   ``bottleneck" compression generated filters. The SVM approach as 
                   represented by Schoelkopf was superior to all the methods except 
                   the neural network one, where it was, although occasionally 
                   worse, essentially comparable. However, the SVM methods turned 
                   out to be quite sensitive to the choice of representation and 
                    kernel in ways which are not well understood; for the time 
                    being, this leaves the neural network approach as the most 
                    robust.},
}
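%
% Readers who want to try the general one-class setup of Manevitz01 above can
% do so with off-the-shelf tools; the sketch below uses scikit-learn's
% OneClassSVM over a tf-idf representation. It is a modern illustration, not
% the authors' implementation, and the kernel and nu values are assumptions.
%
%    from sklearn.feature_extraction.text import TfidfVectorizer
%    from sklearn.svm import OneClassSVM
%
%    def train_one_class(positive_docs, nu=0.1):
%        """Fit a one-class SVM on documents of the target category only."""
%        vec = TfidfVectorizer()
%        X = vec.fit_transform(positive_docs)
%        return vec, OneClassSVM(kernel="linear", nu=nu).fit(X)
%
%    def is_member(doc, vec, model):
%        """predict() returns +1 for in-class documents, -1 for outliers."""
%        return model.predict(vec.transform([doc]))[0] == 1
%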
@inBook{Manning99a,
   author       = {Christopher Manning and Hinrich Sch{\"{u}}tze},
   title        = {Foundations of Statistical Natural Language Processing},
   publisher    = {The {MIT} Press},
   address      = {Cambridge, {US}},
   year         = {1999},
   chapter      = {16: Text Categorization},
   pages        = {575--608},
   url          = {},
   abstract     = {},
}
@article{Maron61,
   author       = {M.E. Maron},
   title        = {Automatic indexing: an experimental inquiry},
   year         = {1961},
   journal      = {Journal of the Association for Computing Machinery},
   volume       = {8},
   number       = {3},
   pages        = {404--417},
   url          = {http://www.acm.org/pubs/articles/journals/jacm/1961-8-3/p404-maron/p404-maron.pdf},
   abstract     = {This inquiry examines a technique for automatically classifying 
                   (indexing) documents according to their subject content. The 
                   task, in essence, is to have a computing machine read a document 
                   and on the basis of the occurrence of selected clue words decide 
                   to which of many subject categories the document in question 
                   belongs. This paper describes the design, execution and 
                   evaluation of a modest experimental study aimed at testing 
                   empirically one statistical technique for automatic indexing.},
}
@inProceedings{Masand92,
   author       = {Briji Masand and Gordon Linoff and David Waltz},
   title        = {Classifying news stories using memory-based reasoning},
   booktitle    = {Proceedings of SIGIR-92, 15th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Nicholas J. Belkin and Peter Ingwersen and Annelise Mark 
                   Pejtersen},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Kobenhavn, {DK}},
   pages        = {59--65},
   year         = {1992},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/133160/p59-masand/p59-masand.pdf},
   abstract     = {We describe a method for classifying news stories using Memory 
                    Based Reasoning (MBR, a k-nearest neighbor method), which does not 
                   require manual topic definitions. Using an already coded training 
                   database of about 50,000 stories from the Dow Jones Press Release 
                   News Wire, and SEEKER [Stanfill] (a text retrieval system that 
                   supports relevance feedback) as the underlying match engine, 
                   codes are assigned to new, unseen stories with a recall of about 
                   80\% and precision of about 70\%. There are about 350 different 
                   codes to be assigned. Using a massively parallel supercomputer, 
                   we leverage the information already contained in the thousands of 
                   coded stories and are able to code a story in about 2 seconds. 
                   Given SEEKER, the text retrieval system, we achieved these 
                   results in about two person-months. We believe this approach is 
                   effective in reducing the development time to implement 
                    classification systems involving a large number of topics for the 
                    purpose of classification, message routing, etc.},
}
@inCollection{Masand94,
   author       = {Briji Masand},
   title        = {Optimising confidence of text classification by evolution of 
                   symbolic expressions},
   booktitle    = {Advances in genetic programming},
   publisher    = {The {MIT} Press},
   address      = {Cambridge, {US}},
   year         = {1994},
   chapter      = {21},
   editor       = {Kenneth E. Kinnear},
   pages        = {459--476},
   url          = {},
   abstract     = {},
}
@inProceedings{Matsuda98,
   author       = {Katsushi Matsuda and Toshikazu Fukushima},
   title        = {Task-oriented {W}orld {W}ide {W}eb retrieval by document type 
                   classification},
   booktitle    = {Proceedings of CIKM-98, 7th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {Georges Gardarin and James C. French and Niki Pissinou and Kia 
                   Makki and Luc Bouganim},
   year         = {1998},
   address      = {Bethesda, {US}},
   pages        = {109--113},
   url          = {http://www.acm.org/pubs/articles/proceedings/cikm/319950/p109-matsuda/p109-matsuda.pdf},
   abstract     = {This paper proposes a novel approach to accurately searching Web 
                   pages for relevant information in problem solving by specifying a 
                    Web document category instead of the user's task. Accessing 
                   information from World Wide Web pages as an approach to problem 
                   solving has become commonplace. However, such a search is 
                   difficult with current search services, since these services only 
                   provide keyword-based search methods that are equivalent to 
                   narrowing down the target references according to domains. 
                   However, problem solving usually involves both a domain and a 
                   task. Accordingly,¹ our approach is based on problem solving 
                   tasks. To specify a user¹s problem solving task, we introduce the 
                   concept of document types that directly relate to the problem 
                   solving tasks; with this approach, users can easily designate 
                    problem solving tasks. We implemented the PageTypeSearch system 
                    based on our approach. The classifier of PageTypeSearch assigns Web 
                    pages to document types by comparing the pages with the 
                    typical structural characteristics of the types. We compare 
                   PageTypeSearch using the document type-indices with a 
                   conventional keyword-based search system in experiments. The 
                   average precision of the document type-based search is 88.9\%, 
                   while the average precision of the keyword-based search is 
                   31.2\%. Moreover, the number of irrelevant references gathered by 
                   our system is about one-thirteenth that of traditional 
                   keyword-based search systems. Our approach has practical 
                   advantages for problem solving by introducing the viewpoint of 
                   tasks to achieve higher performance.},
}
@inProceedings{McCallum98,
   author       = {Andrew K. McCallum and Kamal Nigam},
   title        = {Employing {EM} in pool-based active learning for text 
                   classification},
   booktitle    = {Proceedings of ICML-98, 15th International Conference on Machine 
                   Learning},
   editor       = {Jude W. Shavlik},
   year         = {1998},
   address      = {Madison, {US}},
   pages        = {350--358},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www.cs.cmu.edu/~mccallum/papers/emactive-icml98.ps.gz},
   abstract     = {The paper shows how a text classifier's need for labeled training 
                   documents can be reduced by taking advantage of a large pool of 
                   unlabeled documents. We modify the Query-by-Committee (QBC) 
                   method of active learning to use the unlabeled pool for 
                   explicitly estimating document density when selecting examples 
                   for labeling. Then active learning is combined with 
                   Expectation-Maximization in order to ``fill in'' the class labels 
                   of those documents that remain unlabeled. Experimental results 
                   show that the improvements to active learning require less than 
                   two-thirds as many labeled training examples as previous QBC 
                   approaches, and that the combination of EM and active learning 
                   requires only slightly more than half as many labeled training 
                   examples to achieve the same accuracy as either the improved 
                   active learning or EM alone.},
}
@inProceedings{McCallum98b,
   author       = {Andrew K. McCallum and Ronald Rosenfeld and Tom M. Mitchell and 
                   Andrew Y. Ng},
   title        = {Improving text classification by shrinkage in a hierarchy of 
                   classes},
   booktitle    = {Proceedings of ICML-98, 15th International Conference on Machine 
                   Learning},
   editor       = {Jude W. Shavlik},
   year         = {1998},
   address      = {Madison, {US}},
   pages        = {359--367},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www.cs.cmu.edu/~mccallum/papers/hier-icml98.ps.gz},
   abstract     = {When documents are organized in a large number of topic 
                   categories, the categories are often arranged in a hierarchy. The 
                   US patent database and Yahoo are two examples. The paper shows 
                   that the accuracy of a naive Bayes text classifier can be 
                   significantly improved by taking advantage of a hierarchy of 
                   classes. We adopt an established statistical technique called 
                   shrinkage that smooths parameter estimates of a data-sparse child 
                   with its parent in order to obtain more robust parameter 
                   estimates. The approach is also employed in deleted 
                   interpolation, a technique for smoothing n-grams in language 
                   modeling for speech recognition. Our method scales well to large 
                   data sets, with numerous categories in large hierarchies. 
                   Experimental results on three real world data sets from UseNet, 
                   Yahoo, and corporate Web pages show improved performance, with a 
                   reduction in error up to 29\% over the traditional flat 
                   classifier.},
}
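%
% The core shrinkage step in McCallum98b above replaces a leaf class's
% word-probability estimate with a convex combination of the maximum-
% likelihood estimates along the path from that leaf to the root, plus a
% uniform component. In the paper the mixture weights are fitted by EM on
% held-out data; in the sketch below they are simply passed in.
%
%    def shrunken_word_prob(word, path_estimates, lambdas, vocab_size):
%        """path_estimates: list of dicts P_ml(word|node), ordered leaf to root;
%        lambdas: one weight per node plus one for the uniform distribution,
%        summing to 1."""
%        p = sum(lam * est.get(word, 0.0)
%                for lam, est in zip(lambdas, path_estimates))
%        p += lambdas[len(path_estimates)] / vocab_size   # uniform component
%        return p
%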
@inProceedings{Meretakis00,
   author       = {Dimitris Meretakis and Dimitris Fragoudis and Hongjun Lu and 
                   Spiros Likothanassis},
   title        = {Scalable Association-based Text Classification},
   booktitle    = {Proceedings of CIKM-00, 9th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {McLean, {US}},
   editor       = {Arvin Agah and Jamie Callan and Elke Rundensteiner},
   year         = {2000},
   pages        = {373--374},
   url          = {http://www.cs.ust.hk/~meretaks/papers/mfll-cikm2000.pdf},
   abstract     = {Naive Bayes (NB) classifier has long been considered a core 
                   methodology in text classification mainly due to its simplicity 
                   and computational efficiency. There is an increasing need however 
                   for methods that can achieve higher classification accuracy while 
                   maintaining the ability to process large document collections. In 
                   this paper we examine text categorization methods from a 
                   perspective that considers the tradeoff between accuracy and 
                   scalability to large data sets and large feature sizes. We start 
                   from the observation that Support Vector Machines, one of the 
                    best text categorization methods, cannot scale up to handle the 
                    large document collections involved in many real-world problems. 
                    We then consider Bayesian extensions to NB that achieve higher 
                   accuracy by relaxing its strong independence assumptions. Our 
                   experimental results show that LB, an association-based lazy 
                    classifier, can achieve a good tradeoff between high 
                   classification accuracy and scalability to large document 
                   collections and large feature sizes.},
}
@article{Merkl98,
   author       = {Merkl, Dieter},
   title        = {Text classification with self-organizing maps: Some lessons 
                   learned},
   journal      = {Neurocomputing},
   year         = {1998},
   volume       = {21},
   number       = {1/3},
   pages        = {61--77},
   url          = {},
   abstract     = {We discuss ways of using self-organizing maps for document 
                   classification. Furthermore, we focus on the fact that document 
                   collections lend themselves naturally to a hierarchical structure 
                   defined by the subject matter of the documents. We take advantage 
                   of this fact by using a hierarchically organized neural network, 
                   built up from a number of independent self-organizing maps in 
                   order to enable the true establishment of a document taxonomy. 
                   Using such an architecture, the time needed for training is 
                   reduced substantially and the user is provided with an even more 
                   intuitive metaphor for visualization. Since the single layers of 
                   self-organizing maps represent different aspects of the document 
                   collection at different levels of detail, the neural network 
                   shows the document collection in a form comparable to an atlas 
                   where the user may easily select the most appropriate degree of 
                   granularity depending on the actual focus of interest during the 
                   exploration of the document collection.},
}
@inProceedings{Mladenic98a,
   author       = {Dunja Mladeni{\'{c}}},
   title        = {Turning {{\sc Yahoo!}}\ into an automatic {W}eb page classifier},
   booktitle    = {Proceedings of ECAI-98, 13th European Conference on Artificial 
                   Intelligence},
   publisher    = {John Wiley and Sons, Chichester, {UK}},
   editor       = {Henri Prade},
   year         = {1998},
   pages        = {473--474},
   address      = {Brighton, {UK}},
   url          = {http://www-ai.ijs.si/DunjaMladenic/papers/PWW/pwwECAI98yr.ps.gz},
   abstract     = {The paper describes an approach to automatic Web-page 
                   classification based on the Yahoo hierarchy. Machine learning 
                   techniques developed for learning on text data are used here on 
                   the hierarchical classification structure. The high number of 
                   features is reduced by taking into account the hierarchical 
                   structure and using feature subset selection based on the method 
                   known from information retrieval. Documents are represented as 
                   feature-vectors that include n-grams instead of including only 
                   single words (unigrams) as commonly used when learning on text 
                   data. Based on the hierarchical structure the problem is divided 
                    into subproblems, each representing one of the categories 
                   included in the Yahoo hierarchy. The result of learning is a set 
                   of independent classifiers, each used to predict the probability 
                   that a new example is a member of the corresponding category. 
                   Experimental evaluation on real-world data shows that the 
                    proposed approach gives good results. For more than half of the 
                    test examples, the correct category is among the 3 categories 
                    with the highest predicted probability.},
}
@inProceedings{Mladenic98b,
   author       = {Dunja Mladeni{\'{c}}},
   title        = {Feature subset selection in text learning},
   booktitle    = {Proceedings of ECML-98, 10th European Conference on Machine 
                   Learning},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1398},
   editor       = {Claire N{\'{e}}dellec and C{\'{e}}line Rouveirol},
   address      = {Chemnitz, {DE}},
   pages        = {95--100},
   year         = {1998},
   url          = {http://www-ai.ijs.si/DunjaMladenic/papers/PWW/pwwECML98.ps.gz},
   abstract     = {This paper describes several known and some new methods for 
                   feature subset selection on large text data. Experimental 
                   comparison given on real-world data collected from Web users 
                   shows that characteristics of the problem domain and machine 
                    learning algorithm should be considered when a feature scoring 
                    measure is selected. Our problem domain consists of hyperlinks 
                    given in the form of small documents represented as word vectors. 
                    In our learning experiments a naive Bayesian classifier was used on 
                   text data. The best performance was achieved by the feature 
                   selection methods based on the feature scoring measure called 
                   Odds ratio that is known from information retrieval.},
}
@phdThesis{Mladenic98c,
   author       = {Dunja Mladeni{\'{c}}},
   title        = {Machine Learning on non-homogeneous, distributed text data},
   school       = {J.\ Stefan Institute, University of Ljubljana},
   address      = {Ljubljana, {SL}},
   year         = {1998},
   url          = {http://www-ai.ijs.si/DunjaMladenic/papers/PhD/PhDFinal.ps},
   abstract     = {},
}
@article{Mladenic99,
   author       = {Dunja Mladeni{\'{c}}},
   title        = {Text learning and related intelligent agents: a survey},
   journal      = {{IEEE} Intelligent Systems},
   year         = {1999},
   number       = {4},
   volume       = {14},
   pages        = {44--54},
   url          = {http://www-ai.ijs.si/DunjaMladenic/papers/PWW/agentOverIEEE.ps.gz},
   abstract     = {Analysis of text data using intelligent information retrieval, 
                   machine learning, natural language processing or other related 
                   methods is becoming an important issue for the development of 
                   intelligent agents. There are two frequently used approaches to 
                   the development of intelligent agents using machine learning 
                   techniques: a content-based and a collaborative approach. In the 
                    first approach, the content (e.g., text) plays an important role, 
                    while in the second approach, the existence of several knowledge 
                    sources (e.g., several users) is required. We can say that the 
                   usage of machine learning techniques on text databases (usually 
                   referred to as text-learning) is an important part of the 
                   content-based approach. Examples are agents for locating 
                   information on World Wide Web and Usenet news filtering agents. 
                   There are different research questions important for the 
                   development of text-learning intelligent agents. We focus on 
                   three of them: what representation is used for documents, how is 
                   the high number of features dealt with and which learning 
                   algorithm is used. These questions are addressed in an overview 
                   of the existing approaches to text classification. For 
                   illustration we give a brief description of the content-based 
                   personal intelligent agent named Personal WebWatcher that uses 
                   text-learning for user customized Web browsing.},
}
@inProceedings{Mladenic98d,
   author       = {Dunja Mladeni{\'{c}} and Marko Grobelnik},
   title        = {Word sequences as features in text-learning},
   booktitle    = {Proceedings of ERK-98, the Seventh Electrotechnical and Computer 
                   Science Conference},
   year         = {1998},
   address      = {Ljubljana, {SL}},
   pages        = {145--148},
}
@inProceedings{Mladenic99a,
   author       = {Dunja Mladeni{\'{c}} and Marko Grobelnik},
   title        = {Feature selection for unbalanced class distribution and Naive 
                   {B}ayes},
   booktitle    = {Proceedings of ICML-99, 16th International Conference on Machine 
                   Learning},
   editor       = {Ivan Bratko and Saso Dzeroski},
   year         = {1999},
   address      = {Bled, {SL}},
   pages        = {258--267},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www-ai.ijs.si/DunjaMladenic/papers/PWW/pwwICML99Final.ps.gz},
   abstract     = {This paper describes an approach to feature subset selection that 
                   takes into account problem specifics and learning algorithm 
                   characteristics. It is developed for the Naive Bayesian 
                   classifier applied on text data, since it combines well with the 
                   addressed learning problems. We focus on domains with many 
                   features that also have a highly unbalanced class distribution 
                   and asymmetric misclassification costs given only implicitly in 
                   the problem. By asymmetric misclassification costs we mean that 
                   one of the class values is the target class value for which we 
                   want to get predictions and we prefer false positive over false 
                   negative. Our example problem is automatic document 
                   categorization using machine learning, where we want to identify 
                   documents relevant for the selected category. Usually, only about 
                   1\%-10\% of examples belong to the selected category. Our 
                   experimental comparison of eleven feature scoring measures show 
                   that considering domain and algorithm characteristics 
                   significantly improves the results of classification.},
}
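%
% The Odds ratio measure that Mladenic99a above finds most effective has a
% simple closed form, log [ P(w|pos)(1-P(w|neg)) / ((1-P(w|pos))P(w|neg)) ].
% The sketch below adds add-one smoothing to avoid zeros; the exact smoothing
% used in the paper is not stated in the abstract.
%
%    import math
%
%    def odds_ratio(n_word_pos, n_pos, n_word_neg, n_neg):
%        """n_word_pos: positive documents containing the word; n_pos: all
%        positive documents; likewise for the negative class."""
%        p_pos = (n_word_pos + 1) / (n_pos + 2)   # add-one smoothing
%        p_neg = (n_word_neg + 1) / (n_neg + 2)
%        return math.log(p_pos * (1 - p_neg) / ((1 - p_pos) * p_neg))
%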
@article{Mladenic03,
   author       = {Dunja Mladeni{\'{c}} and Marko Grobelnik},
   title        = {Feature selection on hierarchy of {W}eb documents},
   journal      = {Decision Support Systems},
   year         = {2003},
   number       = {1},
   volume       = {35},
   pages        = {45--87},
   url          = {},
   abstract     = {The paper describes feature subset selection used in learning on 
                   text data (text learning) and gives a brief overview of feature 
                   subset selection commonly used in machine learning. Several known 
                   and some new feature scoring measures appropriate for feature 
                   subset selection on large text data are described and related to 
                   each other. Experimental comparison of the described measures is 
                   given on real-world data collected from the Web. Machine learning 
                   techniques are used on data collected from Yahoo, a large text 
                   hierarchy of Web documents. Our approach includes some original 
                    ideas for handling a large number of features, categories and 
                   documents. The high number of features is reduced by feature 
                   subset selection and additionally by using `stop-list', pruning 
                   low-frequency features and using a short description of each 
                   document given in the hierarchy instead of using the document 
                   itself. Documents are represented as feature-vectors that include 
                   word sequences instead of including only single words as commonly 
                   used when learning on text data. An efficient approach to 
                   generating word sequences is proposed. Based on the hierarchical 
                   structure, we propose a way of dividing the problem into 
                   subproblems, each representing one of the categories included in 
                   the Yahoo hierarchy. In our learning experiments, for each of the 
                    subproblems, a naive Bayesian classifier was used on text data. The 
                   result of learning is a set of independent classifiers, each used 
                   to predict probability that a new example is a member of the 
                   corresponding category. Experimental evaluation on real-world 
                   data shows that the proposed approach gives good results. The 
                   best performance was achieved by the feature selection based on a 
                   feature scoring measure known from information retrieval called 
                    Odds ratio and using a relatively small number of features.},
}
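%
% A minimal Python sketch of odds-ratio term scoring of the kind discussed in
% Mladenic03 above (the best-performing measure in the paper); the smoothing
% constant and the toy counts below are illustrative assumptions, not values
% from the paper.
%
%   import math
%
%   def odds_ratio(df_pos, n_pos, df_neg, n_neg, eps=1e-6):
%       """Odds ratio of a term with respect to the target category.
%
%       df_pos/df_neg: positive/negative training documents containing the term;
%       n_pos/n_neg: total positive/negative training documents.
%       eps is a small smoothing constant to avoid division by zero.
%       """
%       p_pos = (df_pos + eps) / (n_pos + 2 * eps)   # P(term | positive)
%       p_neg = (df_neg + eps) / (n_neg + 2 * eps)   # P(term | negative)
%       return math.log((p_pos * (1.0 - p_neg)) / ((1.0 - p_pos) * p_neg))
%
%   # A term seen in 40 of 100 positive and 50 of 900 negative documents.
%   print(odds_ratio(40, 100, 50, 900))
%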
@article{Moens00,
   author       = {Marie-Francine Moens and Jos Dumortier},
   title        = {Text categorization: the assignment of subject descriptors to 
                   magazine articles},
   journal      = {Information Processing and Management},
   pages        = {841--861},
   year         = {2000},
   number       = {6},
   volume       = {36},
   url          = {},
   abstract     = {Automatic text categorization is an important research area and 
                   has a potential for many text-based applications including text 
                   routing and filtering. Typical text classifiers learn from 
                   example texts that are manually categorized. When categorizing 
                   magazine articles with broad subject descriptors, we study three 
                   aspects of text classification: (1) effective selection of 
                   feature words and proper names that reflect the main topics of 
                   the text; (2) learning algorithms; and (3) improvement of the 
                   quality of the learned classifier by selection of examples. The 
                    $\chi^2$ test, which is sometimes used for selecting terms that are 
                   highly related to a text class, is applied in a novel way when 
                   constructing a category weight vector. Despite a limited number 
                   of training examples, combining an effective feature selection 
                    with the $\chi^2$ learning algorithm for training the text 
                   classifier results in an adequate categorization of new magazine 
                   articles.},
}
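%
% For illustration, the standard chi-square term/category association statistic
% commonly used for term selection, as referred to in the Moens00 abstract above;
% this is the textbook formulation with invented counts, not necessarily the
% exact variant used in the paper.
%
%   def chi_square(a, b, c, d):
%       """Chi-square association between a term and a category.
%
%       a: category documents containing the term;  b: other documents containing it;
%       c: category documents without the term;     d: other documents without it.
%       """
%       n = a + b + c + d
%       den = (a + c) * (b + d) * (a + b) * (c + d)
%       return n * (a * d - c * b) ** 2 / den if den else 0.0
%
%   print(chi_square(a=30, b=20, c=70, d=880))
%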
@inProceedings{Mooney00,
   author       = {Raymond J. Mooney and Loriene Roy},
   title        = {Content-based book recommending using learning for text 
                   categorization},
   booktitle    = {Proceedings of DL-00, 5th ACM Conference on Digital Libraries},
   editor       = {},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {2000},
   address      = {San Antonio, {US}},
   pages        = {195--204},
   url          = {ftp://ftp.cs.utexas.edu/pub/mooney/papers/libra-dl-00.ps.gz},
   abstract     = {Recommender systems improve access to relevant products and 
                   information by making personalized suggestions based on previous 
                   examples of a user's likes and dislikes. Most existing 
                   recommender systems use collaborative filtering methods that base 
                   recommendations on other users' preferences. By contrast, 
                   content-based methods use information about an item itself to 
                   make suggestions. This approach has the advantage of being able 
                   to recommend previously unrated items to users with unique 
                   interests and to provide explanations for its recommendations. We 
                   describe a content-based book recommending system that utilizes 
                   information extraction and a machine-learning algorithm for text 
                   categorization. Initial experimental results demonstrate that 
                   this approach can produce accurate recommendations.},
}
@inProceedings{Moschitti03,
   author       = {Alessandro Moschitti},
   title        = {A study on optimal parameter tuning for {R}occhio text classifier},
   booktitle    = {Proceedings of ECIR-03, 25th European Conference on Information 
                   Retrieval},
   publisher    = {Springer Verlag},
   editor       = {Fabrizio Sebastiani},
   address      = {Pisa, {IT}},
   year         = {2003},
   pages        = {420--435},
   url          = {http://link.springer.de/link/service/series/0558/papers/2633/26330420.pdf},
   abstract     = {The current trend in operational text categorization is the design 
                    of fast classification tools. Several studies on improving the 
                    accuracy of fast but less accurate classifiers have recently been 
                   carried out. In particular, enhanced versions of the Rocchio text 
                   classifier, characterized by high performance, have been 
                   proposed. However, even in these extended formulations the 
                   problem of tuning its parameters is still neglected. In this 
                   paper, a study on parameters of the Rocchio text classifier has 
                   been carried out to achieve its maximal accuracy. The result is a 
                   model for the automatic selection of parameters. Its main feature 
                    is to bound the search space so that optimal parameters can be 
                    selected quickly. The space has been bounded by giving a feature 
                   selection interpretation of the Rocchio parameters. The benefit 
                   of the approach has been assessed via extensive cross evaluation 
                   over three corpora in two languages. Comparative analysis shows 
                   that the performances achieved are relatively close to the best 
                   TC models (e.g. Support Vector Machines).},
}
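%
% The parameters studied in Moschitti03 above are the beta/gamma weights of the
% classical Rocchio profile; a minimal numpy sketch follows. The beta and gamma
% values and the toy vectors are arbitrary placeholders, not the tuned values
% from the paper.
%
%   import numpy as np
%
%   def rocchio_profile(pos, neg, beta=16.0, gamma=4.0):
%       """Rocchio category profile: beta * centroid(pos) - gamma * centroid(neg).
%
%       pos, neg: 2-D arrays of (e.g. tf-idf) document vectors for positive and
%       negative training documents; negative weights are clipped to zero, as is
%       common. A test document is scored by its dot product with the profile.
%       """
%       profile = beta * pos.mean(axis=0) - gamma * neg.mean(axis=0)
%       return np.maximum(profile, 0.0)
%
%   pos = np.array([[1.0, 0.0, 2.0], [0.5, 0.5, 1.0]])
%   neg = np.array([[0.0, 2.0, 0.0]])
%   print(rocchio_profile(pos, neg))
%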
@article{Mostafa00,
   author       = {Javed Mostafa and Wai Lam},
   title        = {Automatic classification using supervised learning in a medical 
                   document filtering application},
   journal      = {Information Processing and Management},
   year         = {2000},
   volume       = {36},
   number       = {3},
   pages        = {415--444},
   url          = {},
   abstract     = {Document classifiers can play an intermediate role in multilevel 
                   filtering systems. The effectiveness of a classifier that uses 
                   supervised learning was analyzed in terms of its accuracy and 
                   ultimately its influence on filtering. The analysis was conducted 
                   in two phases. In the first phase, a multilayer feed-forward 
                   neural network was trained to classify medical documents in the 
                   area of cell biology. The accuracy of the supervised classifier 
                   was established by comparing its performance with a baseline 
                   system that uses human classification information. A relatively 
                    high degree of accuracy was achieved by the supervised method; 
                    however, classification accuracy varied across classes. In the 
                   second phase, to clarify the impact of this performance on 
                   filtering, different types of user profiles were created by 
                   grouping subsets of classes based on their individual 
                   classification accuracy rates. Then, a filtering system with the 
                   neural network integrated into it was used to filter the medical 
                   documents and this performance was compared with the filtering 
                   results achieved using the baseline system. The performance of 
                   the system using the neural network classifier was generally 
                   satisfactory and, as expected, the filtering performance varied 
                   with regard to the accuracy rates of classes.},
}
@inProceedings{Moulinier96a,
   author       = {Isabelle Moulinier and Jean-Gabriel Ganascia},
   title        = {Applying an existing machine learning algorithm to text 
                   categorization},
   booktitle    = {Connectionist, statistical, and symbolic approaches to learning 
                   for natural language processing},
   editor       = {Stefan Wermter and Ellen Riloff and Gabriele Scheler},
   pages        = {343--354},
   year         = {1996},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1040},
   url          = {http://www-poleia.lip6.fr/~moulinie/wijcai.ps.gz},
   abstract     = {The information retrieval community is becoming increasingly 
                   interested in machine learning techniques, of which text 
                   categorization is an application. This paper describes how we 
                   have applied an existing similarity-based learning algorithm, 
                   CHARADE, to the text categorization problem and compares the 
                   results with those obtained using decision tree construction 
                   algorithms. From a machine learning point of view, this study was 
                   motivated by the size of the inspected data in such applications. 
                   Using the same representation of documents, CHARADE offers better 
                   performance than earlier reported experiments with decision trees 
                   on the same corpus. In addition, the way in which learning with 
                   redundancy influences categorization performance is also studied.},
}
@inProceedings{Moulinier96,
   author       = {Isabelle Moulinier and Gailius Ra{\u{s}}kinis and Jean-Gabriel 
                   Ganascia},
   title        = {Text categorization: a symbolic approach},
   booktitle    = {Proceedings of SDAIR-96, 5th Annual Symposium on Document 
                   Analysis and Information Retrieval},
   publisher    = {},
   editor       = {},
   address      = {Las Vegas, {US}},
   year         = {1996},
   pages        = {87--99},
   url          = {http://www-poleia.lip6.fr/~moulinie/sdair.ps.gz},
   abstract     = {Recent research in machine learning has been concerned with 
                   scaling-up to large data sets. Since information retrieval is a 
                   domain where such data sets are widespread, it provides an ideal 
                   application area for machine learning. This paper studies the 
                   ability of symbolic learning algorithms to perform a text 
                   categorization task. This ability depends on both text 
                   representation and feature filtering. We present a unified view 
                   of text categorization systems, focusing on the selection of 
                   features. A new selection technique, SCAR, is proposed for k-DNF 
                   (disjunctive normal form) learners and evaluated on the Reuters 
                   financial data set. Even though our experimental results do not 
                   outperform earlier approaches, they give rise to promising 
                   perspectives.},
}
@inProceedings{Moulinier97,
   author       = {Isabelle Moulinier},
   title        = {Feature selection: a useful preprocessing step},
   booktitle    = {Proceedings of BCSIRSG-97, the 19th Annual Colloquium of the 
                   British Computer Society Information Retrieval Specialist Group},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   series       = {Electronic Workshops in Computing},
   editor       = {Jonathan Furner and David Harper},
   address      = {Aberdeen, {UK}},
   year         = {1997},
   pages        = {},
   url          = {http://www.ewic.org.uk/ewic/workshop/fetch.cfm/IRR-97/Moulinier/Moulinier.ps},
   abstract     = {Statistical classification techniques and machine learning 
                   methods have been applied to some information retrieval (IR) 
                   problems: routing, filtering and categorization. Most of these 
                   methods are usually awkward and sometimes intractable in 
                   high-dimensional feature spaces. In order to reduce 
                   dimensionality, feature selection has been introduced as a 
                   preprocessing step. In this paper, we assess to what extent 
                   feature selection can be used without causing a loss in 
                   effectiveness. This problem can be tackled since a couple of 
                   recent learners (Ripper and Scar) do not require a preprocessing 
                    step. On a text categorization task, using the Reuters-22173 
                   collection, we give empirical evidence that feature selection is 
                   useful: first, the size of the collection index can be 
                   drastically reduced without causing a significant loss in 
                   categorization effectiveness. Then, we show that feature 
                   selection speeds up the time required to automatically build the 
                   categorization system.},
}
@inProceedings{Myers00,
   author       = {Kary Myers and Michael Kearns and Satinder Singh and Marilyn A. 
                   Walker},
   title        = {A Boosting Approach to Topic Spotting on Subdialogues},
   booktitle    = {Proceedings of ICML-00, 17th International Conference on Machine 
                   Learning},
   editor       = {Pat Langley},
   year         = {2000},
   address      = {Stanford, {US}},
   pages        = {655--662},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www.cs.cmu.edu/~rayid/mypapers/ecoc-icml.ps},
   abstract     = {We report the results of a study on topic spotting in 
                   conversational speech. Using a machine learning approach, we 
                   build classifiers that accept an audio file of conversational 
                   human speech as input, and output an estimate of the topic being 
                   discussed. Our methodology makes use of a well-known corpus of 
                   transcribed and topic-labeled speech (the Switchboard corpus), 
                   and involves an interesting double use of the BOOSTEXTER learning 
                   algorithm. Our work is distinguished from previous efforts in 
                   topic spotting by our explicit study of the effects of dialogue 
                   length on classifier performance, and by our use of off-the-shelf 
                   speech recognition technology. One of our main results is the 
                   identification of a single classifier with good performance 
                   (relative to our classifier space) across all subdialogue 
                   lengths.},
}
@inProceedings{Nardiello03,
   author       = {Pio Nardiello and Fabrizio Sebastiani and Alessandro Sperduti},
   title        = {Discretizing continuous attributes in {A}da{B}oost for text 
                   categorization},
   booktitle    = {Proceedings of ECIR-03, 25th European Conference on Information 
                   Retrieval},
   publisher    = {Springer Verlag},
   editor       = {Fabrizio Sebastiani},
   address      = {Pisa, {IT}},
   year         = {2003},
   pages        = {320--334},
   url          = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/ECIR03.pdf},
   abstract     = {We focus on two recently proposed algorithms in the family of 
                   ``boosting''-based learners for automated text classification, 
                   \textsc{AdaBoost.MH} and \textsc{AdaBoost.MH$^{KR}$}. While the 
                   former is a realization of the well-known \textsc{AdaBoost} 
                   algorithm specifically aimed at multi-label text categorization, 
                   the latter is a generalization of the former based on the idea of 
                   learning a committee of classifier sub-committees. Both 
                   algorithms have been among the best performers in text 
                   categorization experiments so far. A problem in the use of both 
                   algorithms is that they require documents to be represented by 
                   binary vectors, indicating presence or absence of the terms in 
                   the document. As a consequence, these algorithms cannot take full 
                   advantage of the ``weighted'' representations (consisting of 
                   vectors of continuous attributes) that are customary in 
                   information retrieval tasks, and that provide a much more 
                   significant rendition of the document's content than binary 
                   representations. In this paper we address the problem of 
                   exploiting the potential of weighted representations in the 
                   context of \textsc{AdaBoost}-like algorithms by discretizing the 
                   continuous attributes through the application of entropy-based 
                   discretization methods. We present experimental results on the 
                   \textsf{Reuters-21578} text categorization collection, showing 
                   that for both algorithms the version with discretized continuous 
                   attributes outperforms the version with traditional binary 
                   representations.},
}
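%
% A rough sketch of entropy-based binary discretization of a continuous term
% weight, in the spirit of the discretization step described in Nardiello03
% above; the cut-point search below is a simplified illustration with invented
% data, not the authors' procedure.
%
%   import math
%   from collections import Counter
%
%   def entropy(labels):
%       n = len(labels)
%       return -sum((c / n) * math.log2(c / n) for c in Counter(labels).values())
%
%   def best_cut(values, labels):
%       """Cut point on a continuous attribute minimizing weighted class entropy."""
%       pairs = sorted(zip(values, labels))
%       n = len(pairs)
%       best_score, best_threshold = float("inf"), None
%       for i in range(1, n):
%           left = [l for _, l in pairs[:i]]
%           right = [l for _, l in pairs[i:]]
%           score = (len(left) * entropy(left) + len(right) * entropy(right)) / n
%           if score < best_score:
%               best_score = score
%               best_threshold = (pairs[i - 1][0] + pairs[i][0]) / 2.0
%       return best_threshold
%
%   weights = [0.0, 0.1, 0.15, 0.4, 0.55, 0.8]   # e.g. tf-idf weights of one term
%   labels  = ["neg", "neg", "neg", "pos", "pos", "pos"]
%   print(best_cut(weights, labels))
%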
@inProceedings{Ng97,
   author       = {Hwee T. Ng and Wei B. Goh and Kok L. Low},
   title        = {Feature selection, perceptron learning, and a usability case 
                   study for text categorization},
   booktitle    = {Proceedings of SIGIR-97, 20th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Nicholas J. Belkin and A. Desai Narasimhalu and Peter Willett},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1997},
   address      = {Philadelphia, {US}},
   pages        = {67--73},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/258525/p67-ng/p67-ng.pdf},
   abstract     = {In this paper, we describe an automated learning approach to text 
                   categorization based on perceptron learning and a new feature 
                   selection metric, called correlation coefficient. Our approach 
                   has been tested on the standard Reuters text categorization 
                   collection. Empirical results indicate that our approach 
                   outperforms the best published results on this Reuters 
                   collection. In particular, our new feature selection method 
                   yields considerable improvement. We also investigate the 
                   usability of our automated learning approach by actually 
                   developing a system that categorizes texts into a tree of 
                   categories. We compare the accuracy of our learning approach to a 
                   rule-based, expert system approach that uses a text 
                   categorization shell built by Carnegie Group. Although our 
                   automated learning approach still gives a lower accuracy, by 
                   appropriately incorporating a set of manually chosen words to use 
                   as features, the combined, semi-automated approach yields 
                   accuracy close to the rule-based approach.},
}
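%
% A toy perceptron trainer over binary term vectors, to illustrate the kind of
% perceptron learning used in Ng97 above; in the paper the feature set would
% first be reduced with their correlation-coefficient metric, which is not shown
% here, and the data and hyperparameters below are made up for the example.
%
%   def train_perceptron(X, y, epochs=10, lr=1.0):
%       """Train a single-unit perceptron; X: 0/1 term vectors, y: +1/-1 labels."""
%       w = [0.0] * len(X[0])
%       b = 0.0
%       for _ in range(epochs):
%           for x, target in zip(X, y):
%               score = sum(wi * xi for wi, xi in zip(w, x)) + b
%               if target * score <= 0:          # misclassified: update weights
%                   w = [wi + lr * target * xi for wi, xi in zip(w, x)]
%                   b += lr * target
%       return w, b
%
%   X = [[1, 0, 1], [1, 1, 0], [0, 1, 1], [0, 0, 1]]
%   y = [+1, +1, -1, -1]
%   print(train_perceptron(X, y))
%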
@article{Nieto02,
   author       = {Salvador Nieto S{\'{a}}nchez and Evangelos Triantaphyllou and 
                   Donald Kraft},
   title        = {A feature mining based approach for the classification of text 
                   documents into disjoint classes},
   journal      = {Information Processing and Management},
   year         = {2002},
   volume       = {38},
   number       = {4},
   pages        = {583--604},
   url          = {},
   abstract     = {This paper proposes a new approach for classifying text documents 
                   into two disjoint classes. The new approach is based on 
                   extracting patterns, in the form of two logical expressions, 
                   which are defined on various features (indexing terms) of the 
                   documents. The pattern extraction is aimed at providing 
                   descriptions (in the form of two logical expressions) of the two 
                   classes of positive and negative examples. This is achieved by 
                   means of a data mining approach, called One Clause At a Time 
                   (OCAT), which is based on mathematical logic. The application of 
                   a logic-based approach to text document classification is 
                   critical when one wishes to be able to justify why a particular 
                   document has been assigned to one class versus the other class. 
                   This situation occurs, for instance, in declassifying documents 
                   that have been previously considered important to national 
                    security and thus are currently being kept secret. Some 
                   computational experiments have investigated the effectiveness of 
                   the OCAT-based approach and compared it to the well-known vector 
                   space model (VSM). These tests also have investigated finding the 
                   best indexing terms that could be used in making these 
                   classification decisions. The results of these computational 
                   experiments on a sample of 2897 text documents from the TIPSTER 
                   collection indicate that the first approach has many advantages 
                   over the VSM approach for solving this type of text document 
                   classification problem. Moreover, a guided strategy for the 
                   OCAT-based approach is presented for deciding which document one 
                   needs to consider next while building the training example sets.},
}
@inProceedings{Nigam98,
   author       = {Kamal Nigam and Andrew K. McCallum and Sebastian Thrun and Tom M. 
                   Mitchell},
   title        = {Learning to classify text from labeled and unlabeled documents},
   booktitle    = {Proceedings of AAAI-98, 15th Conference of the American 
                   Association for Artificial Intelligence},
   publisher    = {{AAAI} Press, Menlo Park, {US}},
   editor       = {},
   year         = {1998},
   pages        = {792--799},
   address      = {Madison, {US}},
   note         = {An extended version appears as~\cite{Nigam00}},
   url          = {http://www.cs.cmu.edu/~knigam/papers/emcat-aaai98.ps},
   abstract     = {In many important text classification problems, acquiring class 
                   labels for training documents is costly, while gathering large 
                   quantities of unlabeled data is cheap. This paper shows that the 
                   accuracy of text classifiers trained with a small number of 
                   labeled documents can be improved by augmenting this small 
                   training set with a large pool of unlabeled documents. We present 
                   a theoretical argument showing that, under common assumptions, 
                   unlabeled data contain information about the target function. We 
                   then introduce an algorithm for learning from labeled and 
                   unlabeled text based on the combination of 
                   Expectation-Maximization with a naive Bayes classifier. The 
                   algorithm first trains a classifier using the available labeled 
                   documents, and probabilistically labels the unlabeled documents; 
                   it then trains a new classifier using the labels for all the 
                   documents, and iterates to convergence. Experimental results, 
                   obtained using text from three different real-world tasks, show 
                   that the use of unlabeled data reduces classification error by up 
                   to 33\%.},
}
@inProceedings{Nigam00a,
   author       = {Kamal Nigam and Rayid Ghani},
   title        = {Analyzing the applicability and effectiveness of co-training},
   booktitle    = {Proceedings of CIKM-00, 9th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {McLean, {US}},
   editor       = {Arvin Agah and Jamie Callan and Elke Rundensteiner},
   year         = {2000},
   pages        = {86--93},
   url          = {http://www.cs.cmu.edu/~knigam/papers/cotrain-CIKM00.pdf},
   abstract     = {Recently there has been significant interest in supervised 
                   learning algorithms that combine labeled and unlabeled data for 
                   text learning tasks. The co-training setting applies to datasets 
                   that have a natural separation of their features into two 
                   disjoint sets. We demonstrate that when learning from labeled and 
                   unlabeled data, algorithms explicitly leveraging a natural 
                   independent split of the features outperform algorithms that do 
                   not. When a natural split does not exist, co-training algorithms 
                   that manufacture a feature split may outperform algorithms not 
                   using a split. These results help explain why co-training 
                   algorithms are both discriminative in nature and robust to the 
                   assumptions of their embedded classifiers.},
}
@article{Nigam00,
   author       = {Kamal Nigam and Andrew K. McCallum and Sebastian Thrun and Tom M. 
                   Mitchell},
   title        = {Text Classification from Labeled and Unlabeled Documents using 
                   {EM}},
   journal      = {Machine Learning},
   year         = {2000},
   number       = {2/3},
   volume       = {39},
   pages        = {103--134},
   url          = {http://www.cs.cmu.edu/~knigam/papers/emcat-mlj99.ps},
   abstract     = {This paper shows that the accuracy of learned text classifiers 
                   can be improved by augmenting a small number of labeled training 
                   documents with a large pool of unlabeled documents. This is 
                   important because in many text classification problems obtaining 
                   training labels is expensive, while large quantities of unlabeled 
                   documents are readily available. We introduce an algorithm for 
                   learning from labeled and unlabeled documents based on the 
                   combination of Expectation-Maximization (EM) and a naive Bayes 
                   classifier. The algorithm first trains a classifier using the 
                   available labeled documents, and probabilistically labels the 
                   unlabeled documents. It then trains a new classifier using the 
                   labels for all the documents, and iterates to convergence. This 
                   basic EM procedure works well when the data conform to the 
                   generative assumptions of the model. However these assumptions 
                   are often violated in practice, and poor performance can result. 
                   We present two extensions to the algorithm that improve 
                   classification accuracy under these conditions: (1) a weighting 
                   factor to modulate the contribution of the unlabeled data, and 
                   (2) the use of multiple mixture components per class. 
                   Experimental results, obtained using text from three different 
                   real-world tasks, show that the use of unlabeled data reduces 
                   classification error by up to 30\%.},
}
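%
% A compact sketch of the basic EM loop described in Nigam00 above (train on
% labeled documents, probabilistically label the unlabeled ones, refit, iterate),
% using scikit-learn's MultinomialNB as the naive Bayes component; the weighting
% factor and multiple-mixture-component extensions of the paper are omitted, and
% the tiny corpus is invented for illustration.
%
%   import numpy as np
%   from sklearn.feature_extraction.text import CountVectorizer
%   from sklearn.naive_bayes import MultinomialNB
%
%   labeled   = ["grain prices rise", "wheat harvest up", "shares fall sharply"]
%   y_labeled = [0, 0, 1]                      # 0 = agriculture, 1 = markets
%   unlabeled = ["corn exports grow", "stocks drop on weak earnings"]
%
%   vec = CountVectorizer()
%   X_l = vec.fit_transform(labeled).toarray()
%   X_u = vec.transform(unlabeled).toarray()
%   n_u, n_c = len(unlabeled), len(set(y_labeled))
%
%   nb = MultinomialNB().fit(X_l, y_labeled)   # initial classifier, labeled data only
%   for _ in range(10):                        # fixed number of EM iterations
%       post = nb.predict_proba(X_u)           # E-step: posteriors for unlabeled docs
%       # M-step: refit on all documents; each unlabeled document appears once per
%       # class, weighted by its posterior probability for that class.
%       X_all = np.vstack([X_l] + [X_u] * n_c)
%       y_all = list(y_labeled) + [c for c in range(n_c) for _ in range(n_u)]
%       w_all = [1.0] * len(y_labeled) + [post[i, c] for c in range(n_c) for i in range(n_u)]
%       nb = MultinomialNB().fit(X_all, y_all, sample_weight=w_all)
%
%   print(nb.predict(X_u))
%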
@phdThesis{Nigam01,
   author       = {Kamal Nigam},
   title        = {Using Unlabeled Data to Improve Text Classification},
   school       = {Computer Science Department, Carnegie Mellon University},
   address      = {Pittsburgh, {US}},
   year         = {2001},
   url          = {http://www-2.cs.cmu.edu/~knigam/papers/thesis-nigam.pdf},
   abstract     = {One key difficulty with text classification learning algorithms 
                   is that they require many hand-labeled examples to learn 
                    accurately. This dissertation demonstrates that supervised 
                   learning algorithms that use a small number of labeled examples 
                   and many inexpensive unlabeled examples can create high-accuracy 
                   text classifiers. By assuming that documents are created by a 
                   parametric generative model, Expectation-Maximization (EM) finds 
                   local maximum a posteriori models and classifiers from all the 
                    data, labeled and unlabeled. These generative models do not 
                    capture all the intricacies of text; however, on some domains this 
                    technique substantially improves classification accuracy, 
                   especially when labeled data are sparse. Two problems arise from 
                   this basic approach. First, unlabeled data can hurt performance 
                   in domains where the generative modeling assumptions are too 
                   strongly violated. In this case the assumptions can be made more 
                   representative in two ways: by modeling sub-topic class 
                   structure, and by modeling super-topic hierarchical class 
                   relationships. By doing so, model probability and classification 
                   accuracy come into correspondence, allowing unlabeled data to 
                   improve classification performance. The second problem is that 
                   even with a representative model, the improvements given by 
                   unlabeled data do not sufficiently compensate for a paucity of 
                   labeled data. Here, limited labeled data provide EM 
                   initializations that lead to low-probability models. Performance 
                   can be significantly improved by using active learning to select 
                   high-quality initializations, and by using alternatives to EM 
                   that avoid low-probability local maxima.},
}
@inProceedings{Oh00,
   author       = {Hyo-Jung Oh and Sung Hyon Myaeng and Mann-Ho Lee},
   title        = {A practical hypertext categorization method using links and 
                   incrementally available class information},
   booktitle    = {Proceedings of SIGIR-00, 23rd ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Athens, {GR}},
   year         = {2000},
   pages        = {264--271},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/345508/p264-oh/p264-oh.pdf},
   abstract     = {As the WWW grows at an increasing speed, a classifier targeted at 
                    hypertext is in high demand. While document 
                    categorization is quite a mature field, the issue of utilizing 
                   hypertext structure and hyperlinks has been relatively 
                   unexplored. In this paper, we propose a practical method for 
                   enhancing both the speed and the quality of hypertext 
                   categorization using hyperlinks. In comparison against a recently 
                   proposed technique that appears to be the only one of the kind, 
                   we obtained up to 18.5\% of improvement in effectiveness while 
                   reducing the processing time dramatically. We attempt to explain 
                   through experiments what factors contribute to the improvement.},
}
@inProceedings{Ontrup01,
   author       = {J{\"{o}}rg Ontrup and Helge Ritter},
   title        = {Text Categorization and Semantic Browsing with Self-Organizing 
                   Maps on Non-{E}uclidean Spaces},
   booktitle    = {Proceedings of PKDD-01, 5th European Conference on Principles and 
                   Practice of Knowledge Discovery in Databases},
   editor       = {Luc De Raedt and Arno Siebes},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Freiburg, {DE}},
   year         = {2001},
   pages        = {338--349},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2168},
   url          = {http://www.techfak.uni-bielefeld.de/ags/ni/publications/papers/OntrupRitter2001-TCA.pdf},
   abstract     = {This paper introduces a new type of Self-Organizing Map (SOM) for 
                   Text Categorization and Semantic Browsing. We propose a 
                    ``hyperbolic SOM'' (HSOM) based on a regular tessellation of the 
                    hyperbolic plane, which is a non-Euclidean space characterized by 
                    constant negative Gaussian curvature. This approach is motivated 
                   by the observation that hyperbolic spaces possess a geometry 
                   where the size of a neighborhood around a point increases 
                   exponentially and therefore provides more freedom to map a 
                   complex information space such as language into spatial 
                   relations. These theoretical findings are supported by our 
                   experiments, which show that hyperbolic SOMs can successfully be 
                   applied to text categorization and yield results comparable to 
                   other state-of-the-art methods. Furthermore we demonstrate that 
                   the HSOM is able to map large text collections in a semantically 
                   meaningful way and therefore allows a ``semantic browsing'' of 
                   text databases.},
}
@article{Paijmans98,
   author       = {Paijmans, Hans},
   title        = {Text categorization as an information retrieval task},
   journal      = {The South African Computer Journal},
   year         = {1999},
   pages        = {4--15},
   volume       = {},
   number       = {21},
   url          = {},
   abstract     = {A number of methods for feature reduction and feature selection 
                   in text classification and information retrieval systems are 
                   compared. These include feature sets that are constructed by 
                   Latent Semantic Indexing, `local dictionaries' in the form of the 
                   words that score highest in frequency in positive class examples 
                   and feature sets that are constructed by relevance feedback 
                   strategies such as J.J. Rocchio's (1971) feedback algorithm or 
                   genetic algorithms. Also, different derivations from the normal 
                   recall and precision performance indicators are discussed and 
                   compared. It was found that categorizers consisting of the words 
                   with highest tf.idf values scored best.},
}
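%
% The Paijmans98 abstract above reports that categorizers built from the words
% with the highest tf.idf values scored best; a minimal sketch of ranking terms
% by one common tf.idf variant follows (the exact weighting used in the paper
% may differ, and the toy documents are invented).
%
%   import math
%   from collections import Counter
%
%   def top_tfidf_terms(docs, k=5):
%       """Return the k terms with the highest tf.idf over a tokenized collection."""
%       n_docs = len(docs)
%       tf = Counter(t for d in docs for t in d)        # collection term frequency
%       df = Counter(t for d in docs for t in set(d))   # document frequency
%       scores = {t: tf[t] * math.log(n_docs / df[t]) for t in tf}
%       return sorted(scores, key=scores.get, reverse=True)[:k]
%
%   docs = [["wheat", "prices", "rise"], ["wheat", "exports", "rise"], ["shares", "fall"]]
%   print(top_tfidf_terms(docs, k=3))
%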
@inProceedings{Paliouras99,
   author       = {Georgios Paliouras and Vangelis Karkaletsis and Constantine D. 
                   Spyropoulos},
   title        = {Learning rules for large vocabulary word sense disambiguation},
   booktitle    = {Proceedings of IJCAI-99, 16th International Joint Conference on 
                   Artificial Intelligence},
   editor       = {Thomas Dean},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   year         = {1999},
   pages        = {674--679},
   address      = {Stockholm, {SE}},
   url          = {http://www.iit.demokritos.gr/~paliourg/papers/IJCAI99.ps.gz},
   abstract     = {Word Sense Disambiguation (WSD) is the process of distinguishing 
                   between different senses of a word. In general, the 
                   disambiguation rules differ for different words. For this reason, 
                   the automatic construction of disambiguation rules is highly 
                   desirable. One way to achieve this aim is by applying machine 
                   learning techniques to training data containing the various 
                   senses of the ambiguous words. In the work presented here, the 
                   decision tree learning algorithm C4.5 is applied on a corpus of 
                   financial news articles. Instead of concentrating on a small set 
                   of ambiguous words, as done in most of the related previous work, 
                   all content words of the examined corpus are disambiguated. 
                   Furthermore, the effectiveness of word sense disambiguation for 
                   different parts of speech (nouns and verbs) is examined 
                   empirically.},
}
@inProceedings{Peng03,
   author       = {Fuchun Peng and Dale Schuurmans},
   title        = {Combining naive {B}ayes $n$-gram and language models for text 
                   classification},
   booktitle    = {Proceedings of ECIR-03, 25th European Conference on Information 
                   Retrieval},
   publisher    = {Springer Verlag},
   editor       = {Fabrizio Sebastiani},
   address      = {Pisa, {IT}},
   year         = {2003},
   pages        = {335--350},
   url          = {http://link.springer.de/link/service/series/0558/papers/2633/26330335.pdf},
   abstract     = {We augment the naive Bayes model with an n-gram language model to 
                   address two shortcomings of naive Bayes text classifiers. The 
                   chain augmented naive Bayes classifiers we propose have two 
                   advantages over standard naive Bayes classifiers. First, a chain 
                   augmented naive Bayes model relaxes some of the independence 
                   assumptions of naive Bayes--allowing a local Markov chain 
                   dependence in the observed variables--while still permitting 
                   efficient inference and learning. Second, smoothing techniques 
                   from statistical language modeling can be used to recover better 
                   estimates than the Laplace smoothing techniques usually used in 
                   naive Bayes classification. Our experimental results on three 
                   real world data sets show that we achieve substantial 
                   improvements over standard naive Bayes classification, while also 
                   achieving state of the art performance that competes with the 
                   best known methods in these cases.},
}
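%
% A bare-bones per-class word-bigram language model classifier, to give a flavour
% of the chain augmented naive Bayes idea in Peng03 above; Laplace smoothing and
% the omission of class priors are simplifications (the paper advocates better
% language-model smoothing), and the toy data are invented.
%
%   import math
%   from collections import Counter, defaultdict
%
%   class BigramClassLM:
%       """Assign a document to the class whose bigram model gives it the highest log-likelihood."""
%
%       def fit(self, docs, labels):
%           self.bigrams = defaultdict(Counter)    # class -> (prev, word) counts
%           self.context = defaultdict(Counter)    # class -> prev-word counts
%           self.vocab = set()
%           for tokens, c in zip(docs, labels):
%               padded = ["<s>"] + tokens
%               self.vocab.update(padded)
%               for prev, word in zip(padded, padded[1:]):
%                   self.bigrams[c][(prev, word)] += 1
%                   self.context[c][prev] += 1
%           return self
%
%       def _loglik(self, tokens, c):
%           v = len(self.vocab)
%           padded = ["<s>"] + tokens
%           return sum(math.log((self.bigrams[c][(p, w)] + 1) / (self.context[c][p] + v))
%                      for p, w in zip(padded, padded[1:]))
%
%       def predict(self, tokens):
%           return max(self.bigrams, key=lambda c: self._loglik(tokens, c))
%
%   lm = BigramClassLM().fit([["wheat", "prices", "rise"], ["shares", "fall", "sharply"]],
%                            ["agriculture", "markets"])
%   print(lm.predict(["wheat", "exports", "rise"]))
%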
@inProceedings{Peng03a,
   author       = {Fuchun Peng and Dale Schuurmans and Shaojun Wang},
   title        = {Language and Task Independent Text Categorization with Simple 
                   Language Models},
   booktitle    = {Proceedings of HLT-03, 3rd Human Language Technology Conference},
   publisher    = {},
   editor       = {},
   address      = {Edmonton, {CA}},
   year         = {2003},
   pages        = {},
   url          = {},
   abstract     = {},
}
@inProceedings{Petasis00,
   author       = {Georgios Petasis and Alessandro Cucchiarelli and Paola Velardi 
                   and Georgios Paliouras and Vangelis Karkaletsis and Constantine 
                   D. Spyropoulos},
   title        = {Automatic adaptation of proper noun dictionaries through 
                   cooperation of machine learning and probabilistic methods},
   booktitle    = {Proceedings of SIGIR-00, 23rd ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Athens, {GR}},
   year         = {2000},
   pages        = {128--135},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/345508/p128-petasis/p128-petasis.pdf},
   abstract     = {The recognition of Proper Nouns (PNs) is considered an important 
                    task in the area of Information Retrieval and Extraction. However, 
                    the high performance of most existing PN classifiers heavily 
                   depends upon the availability of large dictionaries of 
                   domain-specific Proper Nouns, and a certain amount of manual work 
                   for rule writing or manual tagging. Though it is not a heavy 
                   requirement to rely on some existing PN dictionary (often these 
                   resources are available on the web), its coverage of a domain 
                    corpus may be rather low, in the absence of manual updating. In this 
                   paper we propose a technique for the automatic updating of a PN 
                   Dictionary through the cooperation of an inductive and a 
                   probabilistic classifier. In our experiments we show that, 
                   whenever an existing PN Dictionary allows the identification of 
                   50\% of the proper nouns within a corpus, our technique allows, 
                   without additional manual effort, the successful recognition of 
                   about 90\% of the remaining 50\%.},
}
@inProceedings{Peters02,
   author       = {C. Peters and Cornelis H. Koster},
   title        = {Uncertainty-based Noise Reduction and Term Selection in Text 
                   Categorization},
   booktitle    = {Proceedings of ECIR-02, 24th European Colloquium on Information 
                   Retrieval Research},
   editor       = {Fabio Crestani and Mark Girolami and Cornelis J. Van Rijsbergen},
   year         = {2002},
   address      = {Glasgow, {UK}},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2291},
   pages        = {248--267},
   url          = {http://link.springer.de/link/service/series/0558/papers/2291/22910248.pdf},
   abstract     = {This paper introduces a new criterion for term selection, which 
                   is based on the notion of Uncertainty. Term selection according 
                    to this criterion is performed by the elimination of noisy terms 
                   on a class-by-class basis, rather than by selecting the most 
                   significant ones. Uncertainty-based term selection (UC) is 
                   compared to a number of other criteria like Information Gain 
                   (IG), simplified chi-square (SX), Term Frequency (TF) and 
                   Document Frequency (DF) in a Text Categorization setting. 
                   Experiments on data sets with different properties 
                   (Reuters-21578, patent abstracts and patent applications) and 
                   with two different algorithms (Winnow and Rocchio) show that 
                   UC-based term selection is not the most aggressive term selection 
                    criterion, but that its effect is quite stable across data sets 
                    and algorithms. This makes it a good candidate for a general 
                    ``install-and-forget'' term selection mechanism. We also describe 
                    and evaluate a hybrid term selection technique, first applying UC 
                    to eliminate noisy terms and then using another criterion to 
                   select the best terms.},
}
@inProceedings{Ragas98,
   author       = {Hein Ragas and Cornelis H. Koster},
   title        = {Four text classification algorithms compared on a {D}utch corpus},
   booktitle    = {Proceedings of SIGIR-98, 21st ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {W. Bruce Croft and Alistair Moffat and Cornelis J. Van Rijsbergen 
                   and Ross Wilkinson and Justin Zobel},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1998},
   address      = {Melbourne, {AU}},
   pages        = {369--370},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/290941/p369-ragas/p369-ragas.pdf},
   abstract     = {We describe an experiment in applying text classification 
                    algorithms to Dutch texts. Four well-known learning algorithms: 
                    Rocchio's algorithm (W.W. Cohen and Y. Singer, 1995), the Simple 
                    Bayesian Classifier (SBC) (R.O. Duda and P.E. Hart, 1973), the 
                   Sleeping Experts (SE) and Winnow (I. Dagan et al., 1997) were 
                   implemented. They were tested on a corpus of articles from the 
                   Dutch newspaper NRC, and pre-classified into four categories. The 
                   algorithms are compared on learning speed and error rate. We also 
                   investigated the effect of discarding terms, using either a 
                   dynamic stoplist or the Winnow heuristic.},
}
@inProceedings{Raskutti01,
   author       = {Bhavani Raskutti and Herman Ferr{\'{a}} and Adam Kowalczyk},
   title        = {Second Order Features for Maximising Text Classification 
                   Performance},
   booktitle    = {Proceedings of ECML-01, 12th European Conference on Machine 
                   Learning},
   editor       = {Luc De Raedt and Peter A. Flach},
   year         = {2001},
   url          = {http://link.springer.de/link/service/series/0558/papers/2167/21670454.pdf},
   abstract     = {The paper demonstrates that the addition of automatically 
                   selected word-pairs substantially increases the accuracy of text 
                    classification, which is contrary to most previously reported 
                   research. The word-pairs are selected automatically using a 
                   technique based on frequencies of n-grams (sequences of 
                   characters), which takes into account both the frequencies of 
                   word-pairs as well as the context in which they occur. These 
                   improvements are reported for two different classifiers, support 
                   vector machines (SVM) and k-nearest neighbours (kNN), and two 
                   different text corpora. For the first of them, a collection of 
                   articles from PC Week magazine, the addition of word-pairs 
                    increases micro-averaged breakeven accuracy by more than 6 
                    percentage points from a baseline accuracy (without pairs) of around 
                    40\%. For the second one, the standard Reuters benchmark, an SVM 
                    classifier using augmentation with pairs outperforms all previously reported 
                   results.},
}
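%
% A simple illustration of augmenting bag-of-words features with word pairs, the
% kind of second-order features studied in Raskutti01 above; the paper selects
% pairs with an n-gram-frequency criterion, whereas this sketch naively adds all
% unordered pairs co-occurring within a small window.
%
%   def pair_features(tokens, window=3):
%       """Single-word features plus unordered within-window word-pair features."""
%       feats = set(tokens)
%       for i, w in enumerate(tokens):
%           for v in tokens[i + 1 : i + window]:
%               feats.add("+".join(sorted((w, v))))
%       return feats
%
%   print(sorted(pair_features(["support", "vector", "machine", "text", "classifier"])))
%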
@inProceedings{Rau91,
   author       = {Lisa F. Rau and Paul S. Jacobs},
   title        = {Creating segmented databases from free text for text retrieval},
   booktitle    = {Proceedings of SIGIR-91, 14th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Abraham Bookstein and Yves Chiaramella and Gerard Salton and 
                   Vijay V. Raghavan},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Chicago, {US}},
   pages        = {337--346},
   year         = {1991},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/122860/p337-rau/p337-rau.pdf},
   abstract     = {Indexing text for accurate retrieval is a difficult and important 
                   problem. On-line information services generally depend on keyword 
                   indices rather than other methods of retrieval, because of the 
                   practical features of keywords for storage, dissemination, and 
                   browsing as well as for retrieval. However, these methods of 
                   indexing have two major drawbacks: First, they must be 
                   laboriously assigned by human indexers. Second, they are 
                   inaccurate, because of mistakes made by these indexers as well as 
                   the difficulties users have in choosing keywords for their 
                   queries, and the ambiguity a keyword may have. Current natural 
                   language text processing (NLP) methods help to overcome these 
                   problems. Such methods can provide automatic indexing and keyword 
                   assignment capabilities that are at least as accurate as human 
                   indexers in many applications. In addition, NLP systems can 
                   increase the information contained in keyword fields by 
                   separating keywords into segments, or distinct fields that 
                   capture certain discriminating content or relations among 
                   keywords. This paper reports on a system that uses natural 
                   language text processing to derive keywords from free text news 
                   stories, separate these keywords into segments, and automatically 
                   build a segmented database. The system is used as part of a 
                   commercial news clipping and retrieval product. Preliminary 
                   results show improved accuracy, as well as reduced cost, 
                   resulting from these automated techniques.},
}
@inProceedings{Rennie99,
   author       = {Jason Rennie and Andrew Kachites McCallum},
   title        = {Using reinforcement learning to spider the {W}eb efficiently},
   booktitle    = {Proceedings of ICML-99, 16th International Conference on Machine 
                   Learning},
   editor       = {Ivan Bratko and Saso Dzeroski},
   year         = {1999},
   address      = {Bled, {SL}},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   pages        = {335--343},
   url          = {http://www.watson.org/~jrennie/papers/icml99.ps.gz},
   abstract     = {Consider the task of exploring the Web in order to find pages of 
                   a particular kind or on a particular topic. This task arises in 
                   the construction of search engines and Web knowledge bases. The 
                   paper argues that the creation of efficient Web spiders is best 
                   framed and solved by reinforcement learning, a branch of machine 
                   learning that concerns itself with optimal sequential decision 
                   making. One strength of reinforcement learning is that it 
                   provides a formalism for measuring the utility of actions that 
                   give benefit only in the future. We present an algorithm for 
                   learning a value function that maps hyperlinks to future 
                   discounted reward using a naive Bayes text classifier. 
                   Experiments on two real-world spidering tasks show a three-fold 
                   improvement in spidering efficiency over traditional 
                   breadth-first search, and up to a two-fold improvement over 
                   reinforcement learning with immediate reward only.},
}
@inProceedings{Rennie03,
   author       = {Jason Rennie and Lawrence Shih and Jaime Teevan and David Karger},
   title        = {Tackling the Poor Assumptions of Naive Bayes Text Classifiers},
   booktitle    = {Proceedings of ICML-03, 20th International Conference on Machine 
                   Learning},
   editor       = {},
   year         = {2003},
   address      = {Washington, {US}},
   pages        = {},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {},
   abstract     = {},
}
@article{RibeiroNeto01,
   author       = {Berthier Ribeiro-Neto and Alberto H.F. Laender and Luciano R. {De 
                   Lima}},
   title        = {An Experimental Study in Automatically Categorizing Medical 
                   Documents},
   journal      = {Journal of the American Society for Information Science and 
                   Technology},
   year         = {2001},
   number       = {5},
   pages        = {391--401},
   volume       = {52},
   url          = {http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76511157&PLACEBO=IE.pdf},
   abstract     = {In this article, we evaluate the retrieval performance of an 
                   algorithm that automatically categorizes medical documents. The 
                   categorization, which consists in assigning an International Code 
                   of Disease (ICD) to the medical document under examination, is 
                   based on well-known information retrieval techniques. The 
                   algorithm, which we proposed, operates in a fully automatic mode 
                   and requires no supervision or training data. Using a database of 
                   20,569 documents, we verify that the algorithm attains levels of 
                   average precision in the 70-80\% range for category coding and in 
                   the 60-70\% range for subcategory coding. We also carefully 
                   analyze the case of those documents whose categorization is not 
                   in accordance with the one provided by the human specialists. The 
                   vast majority of them represent cases that can only be fully 
                   categorized with the assistance of a human subject (because, for 
                   instance, they require specific knowledge of a given pathology). 
                   For a slim fraction of all documents (0.77\% for category coding 
                   and 1.4\% for subcategory coding), the algorithm makes 
                   assignments that are clearly incorrect. However, this fraction 
                   corresponds to only one-fourth of the mistakes made by the human 
                   specialists.},
}
@inProceedings{Riloff93,
   author       = {Ellen Riloff},
   title        = {Using Cases to Represent Context for Text Classification},
   booktitle    = {Proceedings of CIKM-93, 2nd International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {Bharat Bhargava and Timothy Finin and Yelena Yesha},
   year         = {1993},
   address      = {New York, {US}},
   pages        = {105--113},
   url          = {http://www.cs.utah.edu/~riloff/psfiles/cikm93-w-addend.ps},
   abstract     = {Research on text classification has typically focused on keyword 
                   searches and statistical techniques. Keywords alone cannot always 
                   distinguish the relevant from the irrelevant texts and some 
                   relevant texts do not contain any reliable keywords at all. Our 
                    approach to text classification uses case-based reasoning to 
                   represent natural language contexts that can be used to classify 
                   texts with extremely high precision. The case base of natural 
                   language contexts is acquired automatically during sentence 
                   analysis using a training corpus of texts and their correct 
                   relevancy classifications. A text is represented as a set of 
                   cases and we classify a text as relevant if any of its cases is 
                   deemed to be relevant. We rely on the statistical properties of 
                   the case base to determine whether similar cases are highly 
                   correlated with relevance for the domain. Experiments with the 
                   MUC corpus suggest that case-based text classification can 
                   achieve very high levels of precision and outperforms our 
                   previous algorithms based on relevancy signatures.},
}
@phdThesis{Riloff94a,
   author       = {Ellen Riloff},
   title        = {Information Extraction as a Basis for Portable Text 
                   Classification Systems},
   school       = {Department of Computer Science, University of Massachusetts},
   address      = {Amherst, {US}},
   year         = {1994},
   url          = {http://www.cs.utah.edu/~riloff/psfiles/single-thesis.ps},
   abstract     = {Knowledge-based natural language processing systems have achieved 
                   good success with many tasks, but they often require many 
                   person-months of effort to build an appropriate knowledge base. 
                   As a result, they are not portable across domains. This 
                   knowledge-engineering bottleneck must be addressed before 
                   knowledge-based systems will be practical for real-world 
                   applications. This dissertation addresses the 
                   knowledge-engineering bottleneck for a natural language 
                   processing task called ``information extraction''. A system 
                   called AutoSlog is presented which automatically constructs 
                   dictionaries for information extraction, given an appropriate 
                   training corpus. In the domain of terrorism, AutoSlog created a 
                   dictionary using a training corpus and five person-hours of 
                   effort that achieved 98\% of the performance of a hand-crafted 
                   dictionary that took approximately 1500 person-hours to build. 
                   This dissertation also describes three algorithms that use 
                   information extraction to support high-precision text 
                   classification. As more information becomes available on-line, 
                   intelligent information retrieval will be crucial in order to 
                   navigate the information highway efficiently and effectively. The 
                   approach presented here represents a compromise between 
                   keyword-based techniques and in-depth natural language 
                   processing. The text classification algorithms classify texts 
                   with high accuracy by using an underlying information extraction 
                   system to represent linguistic phrases and contexts. Experiments 
                   in the terrorism domain suggest that increasing the amount of 
                   linguistic context can improve performance. Both AutoSlog and the 
                   text classification algorithms are evaluated in three domains: 
                   terrorism, joint ventures, and microelectronics. An important 
                   aspect of this dissertation is that AutoSlog and the text 
                   classification systems can be easily ported across domains.},
}
@article{Riloff94,
   author       = {Ellen Riloff and Wendy Lehnert},
   title        = {Information extraction as a basis for high-precision text 
                   classification},
   journal      = {{ACM} Transactions on Information Systems},
   year         = {1994},
   number       = {3},
   volume       = {12},
   pages        = {296--333},
   url          = {http://www.cs.utah.edu/~riloff/psfiles/single-acm.ps},
   abstract     = {We describe an approach to text classification that represents a 
                   compromise between traditional word-based techniques and in-depth 
                   natural language processing. Our approach uses a natural language 
                   processing task called information extraction as a basis for 
                   high-precision text classification. We present three algorithms 
                   that use varying amounts of extracted information to classify 
                   texts. The relevancy signatures algorithm uses linguistic 
                   phrases, the augmented relevancy signatures algorithm uses 
                   phrases and local context, and the case-based text classification 
                   algorithm uses larger pieces of context. Relevant phrases and 
                   contexts are acquired automatically using a training corpus. We 
                   evaluate the algorithms on the basis of two test sets from the 
                   MUC-4 corpus. All three algorithms achieved high precision on 
                   both test sets, with the augmented relevancy signatures algorithm 
                   and the case-based algorithm reaching 100\% precision with over 
                   60\% recall on one set. In addition, we compare the algorithms on 
                   a larger collection of 1700 texts and describe an automated 
                   method for empirically deriving appropriate threshold values. The 
                   results suggest that information extraction techniques can 
                   support high-precision text classification and, in general, using 
                   more extracted information improves performance. As a practical 
                   matter, we also explain how the text classification system can be 
                   easily ported across domains.},
}
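%
% A minimal sketch of the relevancy-signatures idea summarised in the abstract
% above: retain indicators that are strongly correlated with relevance in the
% training corpus, and call a new text relevant if it contains any retained
% indicator. As a simplifying assumption, plain word bigrams stand in for the
% linguistic extraction patterns the paper actually derives; the function
% names and thresholds below are illustrative, not taken from the paper.
%
from collections import Counter

def learn_signatures(train_docs, train_labels, rel_threshold=0.9, min_freq=5):
    # Keep "signatures" (here: word bigrams) highly correlated with the
    # relevant class in the training corpus.
    freq, rel_freq = Counter(), Counter()
    for tokens, is_relevant in zip(train_docs, train_labels):
        for bigram in set(zip(tokens, tokens[1:])):
            freq[bigram] += 1
            if is_relevant:
                rel_freq[bigram] += 1
    return {b for b, n in freq.items()
            if n >= min_freq and rel_freq[b] / n >= rel_threshold}

def classify(tokens, signatures):
    # A text is deemed relevant if any of its bigrams is a learned signature.
    return any(b in signatures for b in zip(tokens, tokens[1:]))
%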
@inProceedings{Riloff95,
   author       = {Ellen Riloff},
   title        = {Little Words Can Make a Big Difference for Text Classification},
   booktitle    = {Proceedings of SIGIR-95, 18th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Edward A. Fox and Peter Ingwersen and Raya Fidel},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1995},
   address      = {Seattle, {US}},
   pages        = {130--136},
   url          = {http://www.cs.utah.edu/~riloff/psfiles/sigir95.ps},
   abstract     = {Most information retrieval systems use stopword lists and 
                   stemming algorithms. However, we have found that recognizing 
                   singular and plural nouns, verb forms, negation, and prepositions 
                   can produce dramatically different text classification results. 
                   We present results from text classification experiments that 
                   compare relevancy signatures, which use local linguistic context, 
                   with corresponding indexing terms that do not. In two different 
                   domains, relevancy signatures produced better results than the 
                   simple indexing terms. These experiments suggest that stopword 
                   lists and stemming algorithms may remove or conflate many words 
                   that could be used to create more effective indexing terms.},
}
@inProceedings{Riloff96,
   author       = {Ellen Riloff},
   title        = {Using Learned Extraction Patterns for Text Classification},
   booktitle    = {Connectionist, statistical, and symbolic approaches to learning 
                   for natural language processing},
   editor       = {Stefan Wermter and Ellen Riloff and Gabriele Scheler},
   pages        = {275--289},
   year         = {1996},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1040},
   url          = {http://www.cs.utah.edu/~riloff/psfiles/ijcai-book-chapter.ps},
   abstract     = {A major knowledge-engineering bottleneck for information 
                   extraction systems is the process of constructing an appropriate 
                   dictionary of extraction patterns. AutoSlog is a dictionary 
                   construction system that has been shown to substantially reduce 
                   the time required for knowledge engineering by learning 
                   extraction patterns automatically. However, an open question was 
                   whether these extraction patterns were useful for tasks other 
                   than information extraction. The author describes a series of 
                   experiments that show how the extraction patterns learned by 
                   AutoSlog can be used for text classification. Three dictionaries 
                   produced by AutoSlog for different domains performed well in the 
                    author's text classification experiments.},
}
@inProceedings{Riloff92,
   author       = {Ellen Riloff and Wendy Lehnert},
   title        = {Classifying Texts Using Relevancy Signatures},
   booktitle    = {Proceedings of AAAI-92, 10th Conference of the American 
                   Association for Artificial Intelligence},
   publisher    = {{AAAI} Press, Menlo Park, {US}},
   editor       = {},
   year         = {1992},
   pages        = {329--334},
   address      = {San Jose, {US}},
   url          = {},
   abstract     = {},
}
@inCollection{Riloff99,
   author       = {Ellen Riloff and Jeffrey Lorenzen},
   title        = {Extraction-based Text Categorization: Generating Domain-specific 
                   Role Relationships},
   booktitle    = {Natural language information retrieval},
   editor       = {Tomek Strzalkowski},
   year         = {1999},
   pages        = {167--196},
   publisher    = {Kluwer Academic Publishers},
   address      = {Dordrecht, {NL}},
   url          = {http://www.cs.utah.edu/~riloff/psfiles/nlp-ir-chapter.ps},
   abstract     = {In previous work, we developed several algorithms that use 
                   information extraction techniques to achieve high-precision text 
                   categorization. The relevancy signatures algorithm classifies 
                   texts using extraction patterns, and the augmented relevancy 
                   signatures algorithm classifies texts using extraction patterns 
                   and semantic features associated with role fillers (Riloff and 
                   Lehnert, 1994). These algorithms relied on hand-coded training 
                   data, including annotated texts and a semantic dictionary. In 
                   this chapter, we describe two advances that significantly improve 
                   the practicality of our approach. First, we explain how the 
                   extraction patterns can be generated automatically using only 
                   preclassified texts as input. Second, we present the 
                   word-augmented relevancy signatures algorithm that uses lexical 
                   items to represent domain-specific role relationships instead of 
                   semantic features. Using these techniques, we can automatically 
                   build text categorization systems that benefit from 
                   domain-specific natural language processing.},
}
@article{Robertson84,
   author       = {Stephen E. Robertson and P. Harding},
   title        = {Probabilistic automatic indexing by learning from human indexers},
   year         = {1984},
   journal      = {Journal of Documentation},
   volume       = {40},
   number       = {4},
   pages        = {264--270},
   url          = {},
   abstract     = {},
}
@inProceedings{Roth98,
   author       = {Dan Roth},
   title        = {Learning to resolve natural language ambiguities: a unified 
                   approach},
   booktitle    = {Proceedings of AAAI-98, 15th Conference of the American 
                   Association for Artificial Intelligence},
   publisher    = {{AAAI} Press, Menlo Park, {US}},
   editor       = {},
   year         = {1998},
   pages        = {806--813},
   address      = {Madison, {US}},
   url          = {http://l2r.cs.uiuc.edu/~danr/Papers/aaai98.ps.gz},
   abstract     = {We analyze a few of the commonly used statistics based and 
                   machine learning algorithms for natural language disambiguation 
                   tasks and observe that they can be recast as learning linear 
                   separators in the feature space. Each of the methods makes a 
                   priori assumptions, which it employs, given the data, when 
                   searching for its hypothesis. Nevertheless, as we show, it 
                   searches a space that is as rich as the space of all linear 
                   separators. We use this to build an argument for a data driven 
                   approach which merely searches for a good linear separator in the 
                   feature space, without further assumptions on the domain or a 
                   specific problem. We present such an approach - a sparse network 
                   of linear separators, utilizing the Winnow learning algorithm - 
                   and show how to use it in a variety of ambiguity resolution 
                   problems. The learning approach presented is attribute-efficient 
                    and, therefore, appropriate for domains having a very large number 
                   of attributes. In particular, we present an extensive 
                   experimental comparison of our approach with other methods on 
                   several well studied lexical disambiguation tasks such as 
                   context-sensitive spelling correction, prepositional phrase 
                   attachment and part of speech tagging. In all cases we show that 
                   our approach either outperforms other methods tried for these 
                   tasks or performs comparably to the best.},
}
@inProceedings{Ruiz97,
   author       = {Miguel E. Ruiz and Padmini Srinivasan},
   title        = {Automatic Text Categorization Using Neural Networks},
   booktitle    = {Proceedings of the 8th ASIS/SIGCR Workshop on Classification 
                   Research},
   editor       = {Efthimis Efthimiadis},
   publisher    = {American Society for Information Science, Washington, {US}},
   year         = {1997},
   address      = {Washington, {US}},
   pages        = {59--72},
   url          = {http://www.cs.uiowa.edu/~mruiz/papers/sigcr97/sigcrfinal2.html},
   abstract     = {This paper presents the results obtained from a series of 
                   experiments in automatic text categorization of MEDLINE articles. 
                   The main goal of this research is to build neural networks and to 
                   train them in assigning MeSH phrases based on term frequency of 
                   single words from TITLE and abstract. The experiments compare the 
                   performance of a counterpropagation network against a 
                   backpropagation neural network. Results obtained by using a set 
                   of 2,344 MEDLINE documents are presented and discussed.},
}
@inProceedings{Ruiz99,
   author       = {Miguel E. Ruiz and Padmini Srinivasan},
   title        = {Hierarchical neural networks for text categorization},
   booktitle    = {Proceedings of SIGIR-99, 22nd ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Marti A. Hearst and Fredric Gey and Richard Tong},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Berkeley, {US}},
   year         = {1999},
   pages        = {281--282},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/312624/p281-ruiz/p281-ruiz.pdf},
   abstract     = {This paper presents the design and evaluation of a text 
                   categorization method based on the Hierarchical Mixture of 
                   Experts model. This model uses a divide and conquer principle to 
                   define smaller categorization problems based on a predefined 
                   hierarchical structure. The final classifier is a hierarchical 
                   array of neural networks. The method is evaluated using the UMLS 
                   Metathesaurus as the underlying hierarchical structure, and the 
                    OHSUMED test set of MEDLINE records. Comparisons with the 
                    traditional Rocchio algorithm adapted for text categorization, 
                    as well as with flat neural network classifiers, are provided. 
                    The results show 
                   that the use of the hierarchical structure improves text 
                   categorization performance significantly.},
}
@inProceedings{Ruiz99a,
   author       = {Miguel E. Ruiz and Padmini Srinivasan},
   title        = {Combining Machine Learning and Hierarchical Indexing Structures 
                   for Text Categorization},
   booktitle    = {Proceedings of the 10th ASIS/SIGCR Workshop on Classification 
                   Research},
   editor       = {},
   publisher    = {American Society for Information Science, Washington, {US}},
   year         = {1999},
   address      = {Washington, {US}},
   pages        = {},
   url          = {http://www.cs.uiowa.edu/~mruiz/papers/sigcr_10},
   abstract     = {This paper presents a method that exploits the hierarchical 
                   structure of an indexing vocabulary to guide the development and 
                   training of machine learning methods for automatic text 
                   categorization. We present the design of a hierarchical 
                   classifier based on the divide and conquer principle. The method 
                   is evaluated using backpropagation neural networks, as the 
                   machine learning algorithm, that learn to assign MeSH categories 
                    to a subset of MEDLINE records. Comparisons with the 
                    traditional Rocchio algorithm adapted for text categorization, 
                    as well as with flat neural network classifiers, are provided. 
                    The results 
                   indicate that the use of hierarchical structures improves 
                   performance significantly.},
}
@article{Ruiz02,
   author       = {Miguel Ruiz and Padmini Srinivasan},
   title        = {Hierarchical text classification using neural networks},
   journal      = {Information Retrieval},
   number       = {1},
   volume       = {5},
   pages        = {87--118},
   year         = {2002},
   url          = {http://www.wkap.nl/article.pdf?383232},
   abstract     = {This paper presents the design and evaluation of a text 
                   categorization method based on the Hierarchical Mixture of 
                   Experts model. This model uses a divide and conquer principle to 
                   define smaller categorization problems based on a predefined 
                   hierarchical structure. The final classifier is a hierarchical 
                   array of neural networks. The method is evaluated using the UMLS 
                   Metathesaurus as the underlying hierarchical structure, and the 
                   OHSUMED test set of MEDLINE records. Comparisons with an 
                    optimized version of the traditional Rocchio algorithm adapted 
                    for text categorization, as well as with flat neural network 
                    classifiers, are provided. The results show that the use of the 
                   hierarchical structure improves text categorization performance 
                   with respect to an equivalent flat model. The optimized Rocchio 
                   algorithm achieves a performance comparable with that of the 
                   hierarchical neural networks.},
}
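%
% The divide-and-conquer scheme described in the abstract above can be
% sketched as one small classifier per internal node of a fixed category
% tree, with each document routed from the root towards a leaf. Here
% scikit-learn's MLPClassifier stands in for the paper's backpropagation
% networks; the class, its parameters and the tree representation are
% illustrative assumptions, not the authors' implementation.
%
from sklearn.neural_network import MLPClassifier

class HierarchicalTextClassifier:
    def __init__(self, tree):
        self.tree = tree        # dict: node -> list of child nodes ([] for leaves)
        self.experts = {}       # node -> classifier trained over that node's children

    def fit(self, X, paths):
        # X: dense feature matrix; paths[i]: root-to-leaf category list for document i.
        for node, children in self.tree.items():
            if not children:
                continue
            rows, y = [], []
            for i, path in enumerate(paths):
                if node in path and path.index(node) + 1 < len(path):
                    rows.append(i)
                    y.append(path[path.index(node) + 1])
            if rows:
                net = MLPClassifier(hidden_layer_sizes=(32,), max_iter=300)
                self.experts[node] = net.fit(X[rows], y)
        return self

    def predict_one(self, x, root="root"):
        # Route a single feature vector down the tree until a leaf is reached.
        node = root
        while node in self.experts:
            node = self.experts[node].predict(x.reshape(1, -1))[0]
        return node
%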
@inProceedings{Sable99,
   author       = {Carl L. Sable and Vasileios Hatzivassiloglou},
   title        = {Text-based approaches for the categorization of images},
   booktitle    = {Proceedings of ECDL-99, 3rd European Conference on Research and 
                   Advanced Technology for Digital Libraries},
   editor       = {Serge Abiteboul and Anne-Marie Vercoustre},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1696. An extended version appears as~\cite{Sable00}},
   year         = {1999},
   address      = {Paris, {FR}},
   pages        = {19--38},
   url          = {http://www.cs.columbia.edu/~sable/research/ecdl99.ps},
   abstract     = {The rapid expansion of multimedia digital collections brings to 
                   the fore the need for classifying not only text documents but 
                   their embedded non-textual parts as well. We propose a model for 
                   basing classification of multimedia on broad, non-topical 
                   features, and show how information on targeted nearby pieces of 
                   text can be used to effectively classify photographs on a first 
                   such feature, distinguishing between indoor and outdoor images. 
                   We examine several variations to a TF*IDF-based approach for this 
                   task, empirically analyze their effects, and evaluate our system 
                   on a large collection of images from current news newsgroups. In 
                   addition, we investigate alternative classification and 
                   evaluation methods, and the effect that a secondary feature can 
                   have on indoor/outdoor classification. We obtain a classification 
                   accuracy of 82\%, a number that clearly outperforms baseline 
                   estimates and competing image-based approaches and nears the 
                   accuracy of humans who perform the same task with access to 
                   comparable information.},
}
@article{Sable00,
   author       = {Carl L. Sable and Vasileios Hatzivassiloglou},
   title        = {Text-based approaches for non-topical image categorization},
   journal      = {International Journal of Digital Libraries},
   year         = {2000},
   number       = {3},
   volume       = {3},
   pages        = {261--275},
   url          = {http://www.cs.columbia.edu/~sable/research/ijodl00.pdf},
   abstract     = {The rapid expansion of multimedia digital collections brings to 
                   the fore the need for classifying not only text documents but 
                   their embedded non-textual parts as well. We propose a model for 
                   basing classification of multimedia on broad, non-topical 
                   features, and show how information on targeted nearby pieces of 
                   text can be used to effectively classify photographs on a first 
                   such feature, distinguishing between indoor and outdoor images. 
                   We examine several variations to a TF*IDF-based approach for this 
                   task, empirically analyze their effects, and evaluate our system 
                   on a large collection of images from current news newsgroups. In 
                   addition, we investigate alternative classification and 
                   evaluation methods, and the effects that secondary features have 
                   on indoor/outdoor classification. Using density estimation over 
                   the raw TF*IDF values, we obtain a classification accuracy of 
                   82\%, a number that outperforms baseline estimates and earlier, 
                   image-based approaches, at least in the domain of news articles, 
                   and that nears the accuracy of humans who perform the same task 
                   with access to comparable information.},
}
@inProceedings{Sable01,
   author       = {Carl Sable and Ken Church},
   title        = {Using Bins to Empirically Estimate Term Weights for Text 
                   Categorization},
   booktitle    = {Proceedings of EMNLP-01, 6th Conference on Empirical Methods in 
                   Natural Language Processing},
   year         = {2001},
   publisher    = {Association for Computational Linguistics, Morristown, {US}},
   editor       = {Lillian Lee and Donna Harman},
   pages        = {58--66},
   address      = {Pittsburgh, {US}},
   url          = {http://www.cs.columbia.edu/~sable/research/emnlp01.ps},
   abstract     = {This paper introduces a term weighting method for text 
                   categorization based on smoothing ideas borrowed from speech 
                   recognition. Empirical estimates of weights (likelihood ratios) 
                   become unstable when counts are small. Instead of estimating 
                   weights for individual words, as Naive Bayes does, words with 
                   similar features are grouped into bins, and a single weight is 
                   estimated for each bin. This weight is then assigned to all of 
                   the words in the bin. The bin-based method is intended for tasks 
                   where there is insufficient training data to estimate a separate 
                   weight for each word. Experiments show the bin-based method is 
                   highly competitive with other current methods. In particular, 
                   this method is most similar to Naive Bayes; it generally performs 
                   at least as well as Naive Bayes, and sometimes better.},
}
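%
% The bin-based weighting described above can be sketched as follows: group
% words into bins by a simple feature (here, log document frequency in the
% training set), estimate one smoothed log likelihood ratio per bin, and
% score a document by summing the weights of the bins its words fall into.
% The bin boundaries, the smoothing constant and all names below are
% illustrative assumptions rather than the paper's exact formulation.
%
import math
from collections import Counter

def bin_of(doc_freq):
    # Illustrative binning by log training-set document frequency.
    return min(int(math.log2(doc_freq + 1)), 10)

def train_bin_weights(docs, labels, smoothing=0.5):
    # One log likelihood ratio per bin instead of one per word.
    word_freq, pos, neg = Counter(), Counter(), Counter()
    for tokens, y in zip(docs, labels):
        for w in set(tokens):
            word_freq[w] += 1
            (pos if y == 1 else neg)[w] += 1
    bin_pos, bin_neg = Counter(), Counter()
    for w, f in word_freq.items():
        bin_pos[bin_of(f)] += pos[w]
        bin_neg[bin_of(f)] += neg[w]
    n_pos = sum(1 for y in labels if y == 1)
    n_neg = len(labels) - n_pos
    return word_freq, {
        b: math.log(((bin_pos[b] + smoothing) / (n_pos + 1)) /
                    ((bin_neg[b] + smoothing) / (n_neg + 1)))
        for b in set(bin_pos) | set(bin_neg)}

def score(tokens, word_freq, weights):
    # Positive scores favour the positive category.
    return sum(weights.get(bin_of(word_freq.get(w, 0)), 0.0) for w in set(tokens))
%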
@inProceedings{Sahami96,
   author       = {Mehran Sahami and Marti A. Hearst and Eric Saund},
   title        = {Applying the Multiple Cause Mixture Model to Text Categorization},
   booktitle    = {Proceedings of ICML-96, 13th International Conference on Machine 
                   Learning},
   editor       = {Lorenza Saitta},
   year         = {1996},
   address      = {Bari, {IT}},
   pages        = {435--443},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://robotics.stanford.edu/users/sahami/papers-dir/ml96-mcmm.ps},
   abstract     = {The paper introduces the use of the Multiple Cause Mixture Model 
                   for automatic text category assignment. Although much research 
                   has been done on text categorization, this algorithm is novel in 
                   that it is unsupervised, i.e., it does not require pre-labeled 
                   training examples, and it can assign multiple category labels to 
                   documents. We present very preliminary results of the application 
                   of this model to a standard test collection, evaluating it in 
                   supervised mode in order to facilitate comparison with other 
                   methods, and showing initial results of its use in unsupervised 
                   mode.},
}
@proceedings{Sahami98a,
   editor       = {Mehran Sahami},
   title        = {Proceedings of the 1998 Workshop on Learning for Text 
                   Categorization},
   organization = {American Association for Artificial Intelligence},
   note         = {Available as Technical Report WS-98-05},
   address      = {Madison, {US}},
   year         = {1998},
   url          = {},
}
@inProceedings{Sahami98b,
   author       = {Mehran Sahami and Salim Yusufali and Michelle Q. Baldonado},
   title        = {{SONIA}: a service for organizing networked information 
                   autonomously},
   booktitle    = {Proceedings of DL-98, 3rd ACM Conference on Digital Libraries},
   editor       = {Ian Witten and Rob Akscyn and Frank M. Shipman},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1998},
   address      = {Pittsburgh, {US}},
   pages        = {200--209},
   url          = {http://robotics.stanford.edu/users/sahami/papers-dir/dl98-sonia.ps},
   abstract     = {The recent explosion of online information in digital libraries 
                   and on the World Wide Web has given rise to a number of 
                   query-based search engines and manually constructed topical 
                   hierarchies. However, these tools are quickly becoming inadequate 
                   as query results grow incomprehensibly large and manual 
                   classification in topic hierarchies creates an immense 
                   bottleneck. We address these problems with a system for topical 
                   information space navigation that combines the query-based and 
                   taxonomic systems. We employ machine learning techniques to 
                   create dynamic document categorizations based on the full-text of 
                   articles that are retrieved in response to users' queries. Our 
                   system, named SONIA (Service for Organizing Networked Information 
                   Autonomously), has been implemented as part of the Stanford 
                   Digital Libraries Testbed. It employs a combination of 
                   technologies that takes the results of queries to networked 
                    information sources and, in real time, automatically retrieves, 
                    parses, and organizes these documents into coherent categories for 
                   presentation to the user. Moreover, the system can then save such 
                   document organizations in user profiles which can then be used to 
                   help classify future query results by the same user. SONIA uses a 
                   multi-tier approach to extracting relevant terms from documents 
                   as well as statistical clustering methods to determine potential 
                   topics within a document collection. It also makes use of 
                   Bayesian classification techniques to classify new documents 
                   within an existing categorization scheme. In this way, it allows 
                   users to navigate the results of a query at a more topical level 
                   rather than having to examine each document text separately.},
}
@article{Sakakibara96,
   author       = {Yasubumi Sakakibara and Kazuo Misue and Takeshi Koshiba},
   title        = {A machine learning approach to knowledge acquisitions from text 
                   databases},
   year         = {1996},
   journal      = {International Journal of Human Computer Interaction},
   volume       = {8},
   number       = {3},
   pages        = {309--324},
   url          = {},
   abstract     = {The rapid growth of data in large databases, such as text 
                   databases and scientific databases, requires efficient computer 
                   methods for automating analyses of the data with the goal of 
                   acquiring knowledges or making discoveries. Because the analyses 
                   of data are generally so expensive, most parts in databases 
                   remains as raw, unanalyzed primary data. Technology from machine 
                   learning (ML) will offer efficient tools for the intelligent 
                   analyses of the data using generalization ability. Generalization 
                   is an important ability specific to inductive learning that will 
                   predict unseen data with high accuracy based on learned concepts 
                   from training examples. In this article, we apply ML to 
                   text-database analyses and knowledge acquisitions from text 
                   databases. We propose a completely new approach to the problem of 
                   text classification and extracting keywords by using ML 
                   techniques. We introduce a class of representations for 
                   classifying text data based on decision trees; (i.e., decision 
                   trees over attributes on strings) and present an algorithm for 
                   learning them inductively. Our algorithm has the following 
                    features: it does not need any natural language processing 
                    technique, and it is robust to noisy data. We show that our 
                   learning algorithm can be used for automatic extraction of 
                   keywords for text retrieval and automatic text categorization. We 
                   also demonstrate some experimental results using our algorithm on 
                   the problem of classifying bibliographic data and extracting 
                   keywords in order to show the effectiveness of our approach.},
}
@inProceedings{Sakkis01,
   author       = {Georgios Sakkis and Ion Androutsopoulos and Georgios Paliouras 
                   and Vangelis Karkaletsis and Constantine D. Spyropoulos and 
                   Panagiotis Stamatopoulos},
   title        = {Stacking Classifiers for Anti-Spam Filtering of E-Mail},
   booktitle    = {Proceedings of EMNLP-01, 6th Conference on Empirical Methods in 
                   Natural Language Processing},
   year         = {2001},
   publisher    = {Association for Computational Linguistics, Morristown, {US}},
   editor       = {Lillian Lee and Donna Harman},
   pages        = {44--50},
   address      = {Pittsburgh, {US}},
   url          = {http://arXiv.org/pdf/cs/0106040},
   abstract     = {We evaluate empirically a scheme for combining classifiers, known 
                   as stacked generalization, in the context of anti-spam filtering, 
                   a novel cost-sensitive application of text categorization. 
                   Unsolicited commercial e-mail, or "spam", floods mailboxes, 
                   causing frustration, wasting bandwidth, and exposing minors to 
                   unsuitable content. Using a public corpus, we show that stacking 
                   can improve the efficiency of automatically induced anti-spam 
                   filters, and that such filters can be used in real-life 
                   applications.},
}
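%
% Stacked generalization, as evaluated above, has a compact modern analogue in
% scikit-learn: base text classifiers are combined by a meta-level learner
% trained on their out-of-fold predictions. The particular base learners,
% meta-learner and parameters below are illustrative and are not the
% configuration used in the paper.
%
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline

# Level-0 classifiers feed a level-1 (meta) learner via cross-validated predictions.
stack = StackingClassifier(
    estimators=[("nb", MultinomialNB()),
                ("knn", KNeighborsClassifier(n_neighbors=5))],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5)
spam_filter = make_pipeline(TfidfVectorizer(), stack)
# spam_filter.fit(train_texts, train_labels); spam_filter.predict(test_texts)
%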
@article{Sakkis03,
   author       = {Georgios Sakkis and Ion Androutsopoulos and Georgios Paliouras 
                   and Vangelis Karkaletsis and Constantine D. Spyropoulos and 
                   Panagiotis Stamatopoulos},
   title        = {A Memory-Based Approach to Anti-Spam Filtering for Mailing Lists},
   journal      = {Information Retrieval},
   publisher    = {Kluwer Academic Publishers},
   issn         = {1386-4564},
   number       = {1},
   volume       = {6},
   pages        = {49--73},
   year         = {2003},
   url          = {http://www.kluweronline.com/issn/1386-4564},
   abstract     = {This paper presents an extensive empirical evaluation of 
                   memory-based learning in the context of anti-spam filtering, a 
                   novel cost-sensitive application of text categorization that 
                   attempts to identify automatically unsolicited commercial 
                   messages that flood mailboxes. Focusing on anti-spam filtering 
                   for mailing lists, a thorough investigation of the effectiveness 
                   of a memory-based anti-spam filter is performed using a publicly 
                   available corpus. The investigation includes different attribute 
                   and distance-weighting schemes, and studies on the effect of the 
                   neighborhood size, the size of the attribute set, and the size of 
                   the training corpus. Three different cost scenarios are 
                   identified, and suitable cost-sensitive evaluation functions are 
                   employed. We conclude that memory-based anti-spam filtering for 
                   mailing lists is practically feasible, especially when combined 
                   with additional safety nets. Compared to a previously tested 
                   Naive Bayes filter, the memory-based filter performs on average 
                   better, particularly when the misclassification cost for non-spam 
                   messages is high.},
}
@inProceedings{Sasaki98,
   author       = {Minoru Sasaki and Kenji Kita},
   title        = {Automatic text categorization based on hierarchical rules},
   booktitle    = {Proceedings of the 5th International Conference on Soft Computing 
                   and Information},
   publisher    = {World Scientific, Singapore, {SN}},
   address      = {Iizuka, {JP}},
   year         = {1998},
   pages        = {935--938},
   url          = {http://www-a2k.is.tokushima-u.ac.jp/member/sasaki/frame_home/Papers/IIZUKA98.ps},
   abstract     = {Document categorization, which is defined as the classification 
                   of text documents into one of several fixed classes or 
                   categories, has become important with the explosive growth of the 
                   World Wide Web. The goal of the work described in this paper is 
                   to automatically categorize Web documents in order to enable 
                   effective retrieval of Web information. In this paper, based on 
                   the rule learning algorithm RIPPER (Repeated Incremental Pruning 
                   to Produce Error Reduction), we propose an efficient method for 
                   hierarchical document categorization.},
}
@inProceedings{Sasaki98a,
   author       = {Minoru Sasaki and Kenji Kita},
   title        = {Rule-based text categorization using hierarchical categories},
   booktitle    = {Proceedings of SMC-98, IEEE International Conference on Systems, 
                   Man, and Cybernetics},
   editor       = {},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   address      = {La Jolla, {US}},
   year         = {1998},
   pages        = {2827--2830},
   url          = {http://www-a2k.is.tokushima-u.ac.jp/member/sasaki/frame_home/Papers/SMC.ps},
   abstract     = {Document categorization, which is defined as the classification 
                   of text documents into one of several fixed classes or 
                   categories, has become important with the explosive growth of the 
                   World Wide Web. The goal of the work described here is to 
                   automatically categorize Web documents in order to enable 
                   effective retrieval of Web information. In this paper, based on 
                   the rule learning algorithm RIPPER (for Repeated Incremental 
                   Pruning to Produce Error Reduction), we propose an efficient 
                   method for hierarchical document categorization.},
}
@inProceedings{Schapire98,
   author       = {Robert E. Schapire and Yoram Singer and Amit Singhal},
   title        = {Boosting and {R}occhio applied to text filtering},
   booktitle    = {Proceedings of SIGIR-98, 21st ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {W. Bruce Croft and Alistair Moffat and Cornelis J. Van Rijsbergen 
                   and Ross Wilkinson and Justin Zobel},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1998},
   address      = {Melbourne, {AU}},
   pages        = {215--223},
   url          = {http://www.research.att.com/~schapire/cgi-bin/uncompress-papers/SchapireSiSi98.ps},
   abstract     = {We discuss two learning algorithms for text filtering: modified 
                   Rocchio and a boosting algorithm called AdaBoost. We show how 
                   both algorithms can be adapted to maximize any general utility 
                   matrix that associates cost (or gain) for each pair of machine 
                   prediction and correct label. We first show that AdaBoost 
                   significantly outperforms another highly effective text filtering 
                   algorithm. We then compare AdaBoost and Rocchio over three large 
                   text filtering tasks. Overall both algorithms are comparable and 
                   are quite effective. AdaBoost produces better classifiers than 
                   Rocchio when the training collection contains a very large number 
                   of relevant documents. However, on these tasks, Rocchio runs much 
                   faster than AdaBoost.},
}
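%
% One way to read the "general utility matrix" idea above: once a filter has
% an estimated probability that a document is relevant, it takes the action
% (deliver or discard) with the highest expected utility under a gain/cost
% matrix over (action, true label) pairs. The matrix values and the function
% below are illustrative, not the settings used in the paper.
%
def best_action(p_relevant, utility):
    # utility[action][true_label] is the gain (negative for a cost) of each outcome.
    expected = {action: p_relevant * utility[action]["relevant"]
                        + (1 - p_relevant) * utility[action]["irrelevant"]
                for action in utility}
    return max(expected, key=expected.get)

# Example: delivering an irrelevant document costs 1, discarding a relevant one costs 2.
U = {"deliver": {"relevant": 3, "irrelevant": -1},
     "discard": {"relevant": -2, "irrelevant": 0}}
print(best_action(0.4, U))   # prints the utility-maximizing action at p = 0.4
%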
@article{Schapire00,
   author       = {Robert E. Schapire and Yoram Singer},
   title        = {{{\sc BoosTexter}}: a boosting-based system for text 
                   categorization},
   journal      = {Machine Learning},
   year         = {2000},
   number       = {2/3},
   volume       = {39},
   pages        = {135-168},
   url          = {http://www.research.att.com/~schapire/papers/SchapireSi98b.ps.Z},
   abstract     = {This work focuses on algorithms which learn from examples to 
                   perform multiclass text and speech categorization tasks. Our 
                   approach is based on a new and improved family of boosting 
                   algorithms. We describe in detail an implementation, called 
                   BoosTexter, of the new boosting algorithms for text 
                   categorization tasks. We present results comparing the 
                   performance of BoosTexter and a number of other 
                   text-categorization algorithms on a variety of tasks. We conclude 
                   by describing the application of our system to automatic 
                   call-type identification from unconstrained spoken customer 
                   responses.},
}
@inProceedings{Scheffer99,
   author       = {Tobias Scheffer and Thorsten Joachims},
   title        = {Expected error analysis for model selection},
   booktitle    = {Proceedings of ICML-99, 16th International Conference on Machine 
                   Learning},
   editor       = {Ivan Bratko and Saso Dzeroski},
   year         = {1999},
   address      = {Bled, {SL}},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   pages        = {361-370},
   url          = {http://www-ai.cs.uni-magdeburg.de/~scheffer/papers/icml99.ps},
   abstract     = {In order to select a good hypothesis language (or model) from a 
                   collection of possible models, one has to assess the 
                   generalization performance of the hypothesis which is returned by 
                   a learner that is bound to use that model. The paper deals with a 
                   new and very efficient way of assessing this generalization 
                   performance. We present an analysis which characterizes the 
                   expected generalization error of the hypothesis with least 
                   training error in terms of the distribution of error rates of the 
                   hypotheses in the model. This distribution can be estimated very 
                   efficiently from the data which immediately leads to an efficient 
                   model selection algorithm. The analysis predicts learning curves 
                   with a very high precision and thus contributes to a better 
                   understanding of why and when over-fitting occurs. We present 
                   empirical studies (controlled experiments on Boolean decision 
                   trees and a large-scale text categorization problem) which show 
                   that the model selection algorithm leads to error rates which are 
                   often as low as those obtained by 10-fold cross validation 
                   (sometimes even lower). However, the algorithm is much more 
                    efficient (because the learner does not have to be invoked at 
                   all) and thus solves model selection problems with as many as a 
                   thousand relevant attributes and 12000 examples.},
}
@inProceedings{Schutze95,
   author       = {Hinrich Sch{\"{u}}tze and David A. Hull and Jan O. Pedersen},
   title        = {A comparison of classifiers and document representations for the 
                   routing problem},
   booktitle    = {Proceedings of SIGIR-95, 18th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Edward A. Fox and Peter Ingwersen and Raya Fidel},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1995},
   address      = {Seattle, {US}},
   pages        = {229--237},
   url          = {ftp://parcftp.xerox.com/pub/qca/papers/sigir95.ps.gz},
   abstract     = {In this paper, we compare learning techniques based on 
                   statistical classification to traditional methods of relevance 
                   feedback for the document routing problem. We consider three 
                   classification techniques which have decision rules that are 
                   derived via explicit error minimization: linear discriminant 
                   analysis, logistic regression, and neural networks. We 
                   demonstrate that the classifiers perform 10-15\% better than 
                   relevance feedback via Rocchio expansion for the TREC-2 and 
                   TREC-3 routing tasks. Error minimization is difficult in 
                   high-dimensional feature spaces because the convergence process 
                   is slow and the models are prone to overfitting. We use two 
                   different strategies, latent semantic indexing and optimal term 
                   selection, to reduce the number of features. Our results indicate 
                   that features based on latent semantic indexing are more 
                   effective for techniques such as linear discriminant analysis and 
                   logistic regression, which have no way to protect against 
                   overfitting. Neural networks perform equally well with either set 
                   of features and can take advantage of the additional information 
                   available when both feature sets are used as input.},
}
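%
% The feature-reduction strategy compared above (latent semantic indexing
% feeding an error-minimizing linear classifier) can be approximated with a
% short scikit-learn pipeline: truncated SVD over TF*IDF vectors followed by
% logistic regression. The component count and other parameters below are
% illustrative assumptions, not the values used in the experiments.
%
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# LSI-style dimensionality reduction before a linear classifier fit by error minimization.
routing_model = make_pipeline(
    TfidfVectorizer(sublinear_tf=True),
    TruncatedSVD(n_components=100),
    LogisticRegression(max_iter=1000))
# routing_model.fit(train_texts, train_labels); routing_model.predict(test_texts)
%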
@article{Schutze98,
   author       = {Hinrich Sch{\"{u}}tze},
   title        = {Automatic word sense discrimination},
   journal      = {Computational Linguistics},
   year         = {1998},
   volume       = {24},
   number       = {1},
   pages        = {97--124},
   url          = {},
   abstract     = {This paper presents context-group discrimination, a 
                   disambiguation algorithm based on clustering. Senses are 
                   interpreted as groups (or clusters) of similar contexts of the 
                   ambiguous word. Words, contexts and senses are represented in 
                   Word Space, a high-dimensional real-valued space in which 
                   closeness corresponds to semantic similarity. Similarity in Word 
                   Space is based on second-order co-occurrence: two tokens (or 
                   contexts) of the ambiguous word are assigned to the same sense 
                   cluster if the words they co-occur with in turn occur with 
                   similar words in a training corpus. The algorithm is automatic 
                   and unsupervised in both training and application: senses are 
                   induced from a corpus without labeled training instances or other 
                   external knowledge sources. The paper demonstrates good 
                   performance of context-group discrimination for a sample of 
                   natural and artificial ambiguous words.},
}
@mastersThesis{Scott98,
   author       = {Sam Scott},
   title        = {Feature Engineering for a Symbolic Approach to Text 
                   Classification},
   school       = {Computer Science Department, University of Ottawa},
   address      = {Ottawa, {CA}},
   year         = {1998},
   url          = {http://ai.iit.nrc.ca/II_public/Classification/thesis.pdf},
   abstract     = {},
}
@inProceedings{Scott99,
   author       = {Sam Scott and Stan Matwin},
   title        = {Feature engineering for text classification},
   booktitle    = {Proceedings of ICML-99, 16th International Conference on Machine 
                   Learning},
   editor       = {Ivan Bratko and Saso Dzeroski},
   year         = {1999},
   address      = {Bled, {SL}},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   pages        = {379--388},
   url          = {http://wabakimi.carleton.ca/~sscott2/sam/ICML99_Camera.pdf},
   abstract     = {Most research in text classification to date has used a ``bag of 
                   words'' representation in which each feature corresponds to a 
                   single word. The paper examines some alternative ways to 
                   represent text based on syntactic and semantic relationships 
                   between words (phrases, synonyms and hypernyms). We describe the 
                   new representations and try to justify our hypothesis that they 
                   could improve the performance of a rule based learner. The 
                   representations are evaluated using the RIPPER learning algorithm 
                   on the Reuters-21578 and DigiTrad test corpora. On their own, the 
                   new representations are not found to produce significant 
                   performance improvements. We also try combining classifiers based 
                   on different representations using a majority voting technique, 
                   and this improves performance on both test collections. In our 
                   opinion, more sophisticated natural language processing 
                   techniques need to be developed before better text 
                   representations can be produced for classification.},
}
@inProceedings{Sebastiani99,
   author       = {Fabrizio Sebastiani},
   title        = {A Tutorial on Automated Text Categorisation},
   booktitle    = {Proceedings of ASAI-99, 1st Argentinian Symposium on Artificial 
                   Intelligence},
   editor       = {Analia Amandi and Ricardo Zunino},
   year         = {1999},
   address      = {Buenos Aires, {AR}},
   pages        = {7--35},
   url          = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/ASAI99.pdf},
   note         = {An extended version appears as~\cite{Sebastiani02}},
   abstract     = {The automated categorisation (or classification) of texts into 
                   topical categories has a long history, dating back at least to 
                   1960. Until the late '80s, the dominant approach to the problem 
                   involved knowledge-engineering automatic categorisers, i.e. 
                   manually building a set of rules encoding expert knowledge on how 
                   to classify documents. In the '90s, with the booming production 
                   and availability of on-line documents, automated text 
                   categorisation has witnessed an increased and renewed interest. A 
                   newer paradigm based on machine learning has superseded the 
                   previous approach. Within this paradigm, a general inductive 
                   process automatically builds a classifier by ``learning'', from a 
                   set of previously classified documents, the characteristics of 
                   one or more categories; the advantages are a very good 
                   effectiveness, a considerable savings in terms of expert 
                   manpower, and domain independence. In this tutorial we look at 
                   the main approaches that have been taken towards automatic text 
                   categorisation within the general machine learning paradigm. 
                   Issues of document indexing, classifier construction, and 
                   classifier evaluation, will be touched upon.},
}
@inProceedings{Sebastiani00,
   author       = {Fabrizio Sebastiani and Alessandro Sperduti and Nicola 
                   Valdambrini},
   title        = {An improved boosting algorithm and its application to automated 
                   text categorization},
   booktitle    = {Proceedings of CIKM-00, 9th ACM International Conference on 
                   Information and Knowledge Management},
   address      = {McLean, {US}},
   editor       = {Arvin Agah and Jamie Callan and Elke Rundensteiner},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {2000},
   pages        = {78--85},
   url          = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/CIKM00.pdf},
   abstract     = {We describe {\sc AdaBoost.MH$^{KR}$}, an improved boosting 
                   algorithm, and its application to text categorization. Boosting 
                   is a method for supervised learning which has successfully been 
                   applied to many different domains, and that has proven one of the 
                   best performers in text categorization exercises so far. Boosting 
                   is based on the idea of relying on the collective judgment of a 
                   committee of classifiers that are trained sequentially. In 
                   training the $i$-th classifier special emphasis is placed on the 
                   correct categorization of the training documents which have 
                   proven harder for the previously trained classifiers. {\sc 
                    AdaBoost.MH$^{KR}$} is based on the idea of building, at every 
                   iteration of the learning phase, not a single classifier but a 
                   sub-committee of the $K$ classifiers which, at that iteration, 
                   look the most promising. We report the results of systematic 
                   experimentation of this method performed on the standard {\sf 
                   Reuters-21578} benchmark. These experiments have shown that {\sc 
                   AdaBoost.MH$^{KR}$} is both more efficient to train and more 
                   effective than the original {\sc AdaBoost.MH$^{R}$} algorithm.},
}
@article{Sebastiani02,
   author       = {Fabrizio Sebastiani},
   title        = {Machine learning in automated text categorization},
   journal      = {{ACM} Computing Surveys},
   volume       = {34},
   number       = {1},
   pages        = {1--47},
   year         = {2002},
   url          = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/ACMCS02.pdf},
   abstract     = {The automated categorization (or classification) of texts into 
                   predefined categories has witnessed a booming interest in the 
                    last ten years, due to the increased availability of documents in 
                   digital form and the ensuing need to organize them. In the 
                   research community the dominant approach to this problem is based 
                   on machine learning techniques: a general inductive process 
                   automatically builds a classifier by learning, from a set of 
                   preclassified documents, the characteristics of the categories. 
                   The advantages of this approach over the knowledge engineering 
                   approach (consisting in the manual definition of a classifier by 
                   domain experts) are a very good effectiveness, considerable 
                   savings in terms of expert manpower, and straightforward 
                   portability to different domains. This survey discusses the main 
                   approaches to text categorization that fall within the machine 
                   learning paradigm. We will discuss in detail issues pertaining to 
                   three different problems, namely document representation, 
                   classifier construction, and classifier evaluation.},
}
@article{Shin01,
   author       = {Christian Shin and David Doermann and Azriel Rosenfeld},
   title        = {Classification of document pages using structure-based features},
   journal      = {International Journal on Document Analysis and Recognition},
   number       = {4},
   volume       = {3},
   pages        = {232--247},
   year         = {2001},
   url          = {http://link.springer.de/link/service/journals/10032/papers/1003004/10030232.pdf},
   abstract     = {Searching for documents by their type or genre is a natural way 
                   to enhance the effectiveness of document retrieval. The layout of 
                   a document contains a significant amount of information that can 
                   be used to classify it by type in the absence of domain-specific 
                   models. Our approach to classification is based on "visual 
                   similarity" of layout structure and is implemented by building a 
                   supervised classifier, given examples of each class. We use image 
                   features such as percentages of text and non-text (graphics, 
                   images, tables, and rulings) content regions, column structures, 
                   relative point sizes of fonts, density of content area, and 
                   statistics of features of connected components which can be 
                   derived without class knowledge. In order to obtain class labels 
                   for training samples, we conducted a study where subjects ranked 
                   document pages with respect to their resemblance to 
                   representative page images. Class labels can also be assigned 
                   based on known document types, or can be defined by the user. We 
                   implemented our classification scheme using decision tree 
                   classifiers and self-organizing maps.},
}
@inProceedings{Siolas00,
   author       = {Siolas, Georges and d'Alch{\'e}-Buc, Florence},
   title        = {Support Vector Machines based on a semantic kernel for text 
                   categorization},
   booktitle    = {Proceedings of IJCNN-00, 11th International Joint Conference on 
                   Neural Networks},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   editor       = {Amari, Shun-Ichi and Giles, C. Lee and Gori, Marco and Piuri, 
                   Vincenzo},
   year         = {2000},
   address      = {Como, {IT}},
   volume       = {5},
   pages        = {205--209},
   url          = {http://dlib.computer.org/conferen/ijcnn/0619/pdf/06193581.pdf},
   abstract     = {We propose to solve a text categorization task using a new metric 
                   between documents, based on a priori semantic knowledge about 
                   words. This metric can be incorporated into the definition of 
                   radial basis kernels of Support Vector Machines or directly used 
                   in a K-nearest neighbors algorithm. Both SVM and KNN are tested 
                   and compared on the 20-newsgroups database. Support Vector 
                   Machines provide the best accuracy on test data.},
}
@article{Skarmeta00,
   author       = {Antonio G\'omez Skarmeta and Amine Bensaid and Nadia Tazi},
   title        = {Data mining for text categorization with semi-supervised 
                   agglomerative hierarchical clustering},
   journal      = {International Journal of Intelligent Systems},
   year         = {2000},
   number       = {7},
   volume       = {15},
   pages        = {633--646},
   url          = {http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=72502965&PLACEBO=IE.pdf},
   abstract     = {In this paper we study the application of a semi-supervised agglomerative 
                   hierarchical clustering (ssAHC) algorithm to text categorization, 
                   which consists of assigning text documents to predefined 
                   categories. ssAHC is (i) a clustering algorithm that (ii) uses a 
                   finite design set of labeled data to (iii) help agglomerative 
                   hierarchical clustering (AHC) algorithms partition a finite set 
                   of unlabeled data and then (iv) terminates without the capability 
                   to label other objects. We first describe the text representation 
                   method we use in this work; we then present a feature selection 
                   method that is used to reduce the dimensionality of the feature 
                   space. Finally, we apply the ssAHC algorithm to the Reuters 
                   database of documents and show that its performance is superior 
                   to the Bayes classifier and to the Expectation-Maximization 
                    algorithm combined with the Bayes classifier. We also showed that 
                   ssAHC helps AHC techniques to improve their performance.},
}
@inProceedings{Slattery98,
   author       = {Se{\'{a}}n Slattery and Mark Craven},
   title        = {Combining Statistical and Relational Methods for Learning in 
                   Hypertext Domains},
   booktitle    = {Proceedings of ILP-98, 8th International Conference on Inductive 
                   Logic Programming},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1446},
   editor       = {David Page},
   year         = {1998},
   pages        = {38--52},
   address      = {Madison, {US}},
   url          = {http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-11/www/wwkb/ilp98.ps.gz},
   abstract     = {We present a new approach to learning hypertext classifiers that 
                   combines a statistical text-learning method with a relational 
                   rule learner. This approach is well suited to learning in 
                   hypertext domains because its statistical component allows it to 
                   characterize text in terms of word frequencies, whereas its 
                   relational component is able to describe how neighboring 
                   documents are related to each other by hyperlinks that connect 
                   them. We evaluate our approach by applying it to tasks that 
                   involve learning definitions for (i) classes of pages, (ii) 
                   particular relations that exist between pairs of pages, and (iii) 
                   locating a particular class of information in the internal 
                   structure of pages. Our experiments demonstrate that this new 
                   approach is able to learn more accurate classifiers than either 
                   of its constituent methods alone.},
}
@inProceedings{Slattery00,
   author       = {Se{\'{a}}n Slattery and Mark Craven},
   title        = {Discovering test set regularities in relational domains},
   booktitle    = {Proceedings of ICML-00, 17th International Conference on Machine 
                   Learning},
   editor       = {Pat Langley},
   year         = {2000},
   address      = {Stanford, {US}},
   pages        = {895--902},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www.cs.cmu.edu/~sean/papers/icml2000.ps},
   abstract     = {Machine learning typically involves discovering regularities in a 
                   training set, then applying these learned regularities to 
                   classify objects in a test set. In this paper we present an 
                   approach to discovering additional regularities in the test set, 
                   and show that in relational domains such test set regularities 
                   can be used to improve classification accuracy beyond that 
                   achieved using the training set alone. For example, we have 
                   previously shown how FOIL, a relational learner, can learn to 
                   classify Web pages by discovering training set regularities in 
                   the words occurring on target pages, and on other pages related 
                   by hyperlinks. Here we show how the classification accuracy of 
                   FOIL on this task can be improved by discovering additional 
                   regularities on the test set pages that must be classified. Our 
                    approach can be seen as an extension to Kleinberg's Hubs and 
                   Authorities algorithm that analyzes hyperlink relations among Web 
                   pages. We present evidence that this new algorithm leads to 
                   better test set precision and recall on three binary Web 
                   classification tasks where the test set Web pages are taken from 
                   different Web sites than the training set.},
}
@inProceedings{Slonim01,
   author       = {Noam Slonim and Naftali Tishby},
   title        = {The Power of Word Clusters for Text Classification},
   booktitle    = {Proceedings of ECIR-01, 23rd European Colloquium on Information 
                   Retrieval Research},
   editor       = {},
   year         = {2001},
   address      = {Darmstadt, {DE}},
   publisher    = {},
   pages        = {},
   url          = {http://www.cs.huji.ac.il/labs/learning/Papers/irsg3.eps.gz},
   abstract     = {The recently introduced Information Bottleneck method provides an 
                    information-theoretic framework for extracting features of one 
                    variable that are relevant for the values of another variable. 
                   Several previous works already suggested applying this method for 
                   document clustering, gene expression data analysis, spectral 
                   analysis and more. In this work we present a novel implementation 
                   of this method for supervised text classification. Specifically, 
                   we apply the information bottleneck method to find word-clusters 
                   that preserve the information about document categories and use 
                   these clusters as features for classification. Previous work used 
                   a similar clustering procedure to show that word-clusters can 
                   significantly reduce the feature space dimensionality, with only 
                   a minor change in classification accuracy. In this work we 
                   reproduce these results and go further to show that when the 
                    training sample is small, word clusters can yield significant 
                   improvement in classification accuracy (up to 18\%) over the 
                   performance using the words directly.},
}
@inProceedings{Soucy01,
   author       = {Pascal Soucy and Guy W. Mineau},
   title        = {A Simple Feature Selection Method for Text Classification},
   booktitle    = {Proceedings of IJCAI-01, 17th International Joint Conference on 
                   Artificial Intelligence},
   editor       = {Bernhard Nebel},
   address      = {Seattle, {US}},
   year         = {2001},
   pages        = {897--902},
   url          = {},
   abstract     = {In text classification most techniques use bag-of-words to 
                   represent documents. The main problem is to identify what words 
                   are best suited to classify the documents in such a way as to 
                   discriminate between them. Feature selection techniques are then 
                   needed to identify these words. The feature selection method 
                   presented in this paper is rather simple and computationally 
                    efficient. It combines a well-known feature selection criterion, 
                    information gain, with a new algorithm that adds a feature to the 
                    bag-of-words only if it does not occur too often with the features 
                    already in a small set composed of the best features selected so 
                    far for their high information gain. In brief, it tries to avoid 
                    considering features whose discrimination capability is already 
                    sufficiently covered by previously selected features, thereby 
                    reducing the size of the feature set used to characterize the 
                    document set. This paper presents this feature selection method 
                   and its results, and how we have predetermined some of its 
                   parameters through experimentation.},
}
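%
% A minimal Python sketch of the kind of feature selection the Soucy01
% abstract describes: rank terms by information gain and greedily keep a
% term only if it does not co-occur too often with terms already selected.
% This is only an illustration of the general idea, not the authors' code;
% the threshold values and helper names are assumptions.
%
from collections import Counter
from math import log2

def information_gain(docs, labels, term):
    # docs: list of sets of terms; labels: list of class labels.
    def entropy(subset):
        total = len(subset)
        if total == 0:
            return 0.0
        return -sum((c / total) * log2(c / total) for c in Counter(subset).values())
    with_term = [l for d, l in zip(docs, labels) if term in d]
    without_term = [l for d, l in zip(docs, labels) if term not in d]
    p = len(with_term) / len(docs)
    return entropy(labels) - p * entropy(with_term) - (1 - p) * entropy(without_term)

def select_features(docs, labels, vocabulary, k=100, max_overlap=0.8):
    # Greedy selection: highest information gain first, skipping any term whose
    # documents overlap too much with those of an already selected term.
    ranked = sorted(vocabulary, key=lambda t: information_gain(docs, labels, t), reverse=True)
    selected = []
    for term in ranked:
        if len(selected) == k:
            break
        occurs = [d for d in docs if term in d]
        if not occurs:
            continue
        redundant = any(sum(1 for d in occurs if s in d) / len(occurs) > max_overlap
                        for s in selected)
        if not redundant:
            selected.append(term)
    return selected
%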
@inProceedings{Soucy01a,
   author       = {Pascal Soucy and Guy W. Mineau},
   title        = {A Simple KNN Algorithm for Text Categorization},
   booktitle    = {Proceedings of ICDM-01, IEEE International Conference on Data 
                   Mining},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   editor       = {Nick Cercone and Tsau Y. Lin and Xindong Wu},
   year         = {2001},
   address      = {San Jose, {US}},
   pages        = {647--648},
   url          = {},
   abstract     = {},
}
@inProceedings{Soucy03,
   author       = {Pascal Soucy and Guy W. Mineau},
   title        = {Feature Selection Strategies for Text Categorization},
   booktitle    = {Proceedings of CSCSI-03, 16th Conference of the Canadian Society 
                   for Computational Studies of Intelligence},
   editor       = {Yang Xiang and Brahim Chaib-Draa},
   address      = {Halifax, {CA}},
   year         = {2003},
   pages        = {505--509},
   url          = {},
   abstract     = {},
}
@inProceedings{Spitz00,
   author       = {Larry Spitz and Arman Maghbouleh},
   title        = {Text categorization using character shape codes},
   booktitle    = {Proceedings of the 7th SPIE Conference on Document Recognition 
                   and Retrieval},
   publisher    = {{SPIE}, The International Society for Optical Engineering},
   editor       = {Daniel P. Lopresti and Jiangying Zhou},
   year         = {2000},
   address      = {San Jose, {US}},
   pages        = {174--181},
   url          = {},
   abstract     = {Text categorization in the form of topic identification is a 
                   capability of current interest. The paper is concerned with 
                   categorization of electronic document images. Previous work on 
                   the categorization of document images has relied on optical 
                   character recognition (OCR) to provide the transformation between 
                   the image domain and a domain where pattern recognition 
                   techniques are more readily applied. Our work uses a different 
                   technology to provide this transformation. Character shape coding 
                   is a computationally efficient, extraordinarily robust means of 
                   providing access to the character content of document images. 
                   While this transform is lossy, sufficient salient information is 
                   retained to support many applications. Furthermore, the use of 
                   shape coding is particularly advantageous over OCR in the 
                   processing of page images of poor quality. The authors found that 
                   topic identification performance was maintained or slightly 
                   improved using character shape codes derived from images.},
}
@article{Stamatatos00,
   author       = {Efstathios Stamatatos and Nikos Fakotakis and George Kokkinakis},
   title        = {Automatic text categorization in terms of genre and author},
   journal      = {Computational Linguistics},
   pages        = {471--495},
   year         = {2000},
   number       = {4},
   volume       = {26},
   url          = {},
   abstract     = {The two main factors that characterize a text are its content and 
                   its style, and both can be used as a means of categorization. In 
                   this paper we present an approach to text categorization in terms 
                   of genre and author for Modern Greek. In contrast to previous 
                   stylometric approaches, we attempt to take full advantage of 
                   existing natural language processing (NLP) tools. To this end, we 
                   propose a set of style markers including analysis-level measures 
                   that represent the way in which the input text has been analyzed 
                   and capture useful stylistic information without additional cost. 
                   We present a set of small-scale but reasonable experiments in 
                   text genre detection, author identification, and author 
                   verification tasks and show that the proposed method performs 
                   better than the most popular distributional lexical measures, 
                   i.e., functions of vocabulary richness and frequencies of 
                   occurrence of the most frequent words. All the presented 
                   experiments are based on unrestricted text downloaded from the 
                   World Wide Web without any manual text preprocessing or text 
                   sampling. Various performance issues regarding the training set 
                   size and the significance of the proposed style markers are 
                   discussed. Our system can be used in any application that 
                   requires fast and easily adaptable text categorization in terms 
                   of stylistically homogeneous categories. Moreover, the procedure 
                   of defining analysis-level markers can be followed in order to 
                   extract useful stylistic information using existing text 
                   processing tools.},
}
@inProceedings{Sun01,
   author       = {Aixin Sun and Ee-Peng Lim},
   title        = {Hierarchical Text Classification and Evaluation},
   booktitle    = {Proceedings of ICDM-01, IEEE International Conference on Data 
                   Mining},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   editor       = {Nick Cercone and Tsau Y. Lin and Xindong Wu},
   year         = {2001},
   address      = {San Jose, {US}},
   pages        = {521--528},
   url          = {http://www.cais.ntu.edu.sg:8000/~sunaixin/paper/sun_icdm01.pdf},
   abstract     = {Hierarchical classification refers to the assignment of one or more 
                   suitable categories from a hierarchical category space to a 
                   document. While previous work in hierarchical classification 
                   focused on virtual category trees where documents are assigned 
                   only to the leaf categories, we propose a top-down level-based 
                   classification method that can classify documents to both leaf 
                   and internal categories. As the standard performance measures 
                   assume independence between categories, they have not considered 
                   the documents incorrectly classified into categories that are 
                   similar or not far from the correct ones in the category tree. We 
                   therefore propose the Category-Similarity Measures and 
                   Distance-Based Measures to consider the degree of 
                   misclassification in measuring the classification performance. An 
                   experiment has been carried out to measure the performance of our 
                   proposed hierarchical classification method. The results showed 
                    that our method performs well on the Reuters text collection when 
                    enough training documents are given, and that the new measures have 
                   indeed considered the contributions of misclassified documents.},
}
@inProceedings{Taghva00,
   author       = {Taghva, Kazem and Nartker, Thomas A. and Julie Borsack and Steven 
                   Lumos and Allen Condit and Ron Young},
   title        = {Evaluating text categorization in the presence of OCR errors},
   booktitle    = {Proceedings of the 8th SPIE Conference on Document Recognition 
                   and Retrieval},
   editor       = {Paul B. Kantor and Daniel P. Lopresti and Jiangying Zhou},
   year         = {2000},
   address      = {San Jose, {US}},
   pages        = {68--74},
   publisher    = {SPIE, The International Society for Optical Engineering, 
                   Washington, {US}},
   url          = {},
   abstract     = {In this paper we describe experiments that investigate the 
                   effects of OCR errors on text categorization. In particular, we 
                   show that in our environment, OCR errors have no effect on 
                   categorization when we use a classifier based on the naive Bayes 
                   model. We also observe that dimensionality reduction techniques 
                   eliminate a large number of OCR errors and improve categorization 
                   results.},
}
@inProceedings{Taira99,
   author       = {Hirotoshi Taira and Masahiko Haruno},
   title        = {Feature selection in {SVM} text categorization},
   booktitle    = {Proceedings of AAAI-99, 16th Conference of the American 
                   Association for Artificial Intelligence},
   publisher    = {{AAAI} Press, Menlo Park, {US}},
   editor       = {},
   year         = {1999},
   address      = {Orlando, {US}},
   pages        = {480--486},
   url          = {},
   abstract     = {Investigates the effect of prior feature selection in support 
                   vector machine (SVM) text categorization. The input space was 
                   gradually increased by using mutual information (MI) filtering 
                   and part-of-speech (POS) filtering, which determine the portion 
                   of words that are appropriate for learning from the 
                   information-theoretic and the linguistic perspectives, 
                   respectively. We tested the two filtering methods on SVMs as well 
                   as a decision tree algorithm, C4.5. The SVMs' results common to 
                    both filtering methods are that 1) the optimal number of features 
                   differed completely across categories, and 2) the average 
                   performance for all categories was best when all of the words 
                   were used. In addition, a comparison of the two filtering methods 
                   clarified that POS filtering on SVMs consistently outperformed MI 
                   filtering, which indicates that SVMs cannot find irrelevant parts 
                   of speech. These results suggest a simple strategy for the SVM 
                   text categorization: use a full number of words found through a 
                   rough filtering technique like part-of-speech tagging.},
}
@inProceedings{Taira01,
   author       = {Hirotoshi Taira and Masahiko Haruno},
   title        = {Text Categorization Using Transductive Boosting},
   booktitle    = {Proceedings of ECML-01, 12th European Conference on Machine 
                   Learning},
   editor       = {Luc De Raedt and Peter A. Flach},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Freiburg, {DE}},
   year         = {2001},
   pages        = {454--465},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2167},
   url          = {http://link.springer.de/link/service/series/0558/papers/2167/21670454.pdf},
   abstract     = {In natural language tasks like text categorization, we usually 
                   have an enormous amount of unlabeled data in addition to a small 
                   amount of labeled data. We present here a transductive boosting 
                   method for text categorization in order to make use of the large 
                   amount of unlabeled data efficiently. Our experiments show that 
                   the transductive method outperforms conventional boosting 
                   techniques that employ only labeled data.},
}
@inProceedings{Takamura01,
   author       = {Hiroya Takamura and Yuji Matsumoto},
   title        = {Feature Space Restructuring for {SVM}s with Application to Text 
                   Categorization},
   booktitle    = {Proceedings of EMNLP-01, 6th Conference on Empirical Methods in 
                   Natural Language Processing},
   year         = {2001},
   publisher    = {Association for Computational Linguistics, Morristown, {US}},
   editor       = {Lillian Lee and Donna Harman},
   pages        = {51--57},
   address      = {Pittsburgh, {US}},
   url          = {http://www.cs.cornell.edu/home/llee/emnlp/papers/takamura.pdf},
   abstract     = {In this paper, we propose a new method of text categorization 
                   based on feature space restructuring for SVMs. In our method, 
                   independent components of document vectors are extracted using 
                   ICA and concatenated with the original vectors. This 
                   restructuring makes it possible for SVMs to focus on the latent 
                   semantic space without losing information given by the original 
                   feature space. Using this method, we achieved high performance in 
                    text categorization with both small and large numbers of labeled 
                    data.},
}
@inProceedings{Tan01,
   author       = {Ah-Hwee Tan},
   title        = {Predictive Self-Organizing Networks for Text Categorization},
   booktitle    = {Proceedings of PAKDD-01, 5th Pacific-Asia Conference on 
                   Knowledge Discovery and Data Mining},
   editor       = {David Cheung and Qing Li and Graham Williams},
   year         = {2001},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Hong Kong, {CN}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2035},
   pages        = {66--77},
   url          = {http://link.springer.de/link/service/series/0558/papers/2035/20350066.pdf},
   abstract     = {This paper introduces a class of predictive self-organizing 
                   neural networks known as Adaptive Resonance Associative Map 
                   (ARAM) for classification of free-text documents. Whereas most 
                   statistical approaches to text categorization derive 
                   classification knowledge based on training examples alone, ARAM 
                   performs supervised learning and integrates user-defined 
                   classification knowledge in the form of IF-THEN rules. Through 
                   our experiments on the Reuters-21578 news database, we showed 
                   that ARAM performed reasonably well in mining categorization 
                    knowledge from a sparse and high-dimensional document feature 
                    space. In addition, ARAM's predictive accuracy and learning 
                   efficiency can be improved by incorporating a set of rules 
                   derived from the Reuters category description. The impact of rule 
                   insertion is most significant for categories with a small number 
                   of relevant documents.},
}
@article{Tan02,
   author       = {Chade-Meng Tan and Yuan-Fang Wang and Chan-Do Lee},
   title        = {The use of bigrams to enhance text categorization},
   journal      = {Information Processing and Management},
   year         = {2002},
   volume       = {38},
   number       = {4},
   pages        = {529--546},
   url          = {http://www.serve.com/cmtan/Meng/ig_m.pdf},
   abstract     = {In this paper, we present an efficient text categorization 
                   algorithm that generates bigrams selectively by looking for ones 
                   that have an especially good chance of being useful. The 
                   algorithm uses the information gain metric, combined with various 
                   frequency thresholds. The bigrams, along with unigrams, are then 
                   given as features to two different classifiers: Naïve Bayes and 
                   maximum entropy. The experimental results suggest that the 
                   bigrams can substantially raise the quality of feature sets, 
                   showing increases in the break-even points and F1 measures. The 
                   McNemar test shows that in most categories the increases are very 
                   significant. Upon close examination of the algorithm, we 
                   concluded that the algorithm is most successful in correctly 
                   classifying more positive documents, but may cause more negative 
                   documents to be classified incorrectly.},
}
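%
% A minimal Python sketch of the bigram filtering step described in the Tan02
% abstract: generate bigrams selectively, keeping only those frequent enough
% to be worth scoring (e.g. by information gain) alongside the unigrams. The
% threshold value is an assumption made for the illustration.
%
from collections import Counter

def candidate_bigrams(tokenized_docs, min_doc_freq=5):
    # Count in how many documents each adjacent word pair occurs, then keep
    # only the pairs that reach the document-frequency threshold.
    doc_freq = Counter()
    for tokens in tokenized_docs:
        doc_freq.update(set(zip(tokens, tokens[1:])))
    return [bigram for bigram, freq in doc_freq.items() if freq >= min_doc_freq]
%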
@inProceedings{Taskar01,
   author       = {Benjamin Taskar and Eran Segal and Daphne Koller},
   title        = {Probabilistic Classification and Clustering in Relational Data},
   booktitle    = {Proceedings of IJCAI-01, 17th International Joint Conference on 
                   Artificial Intelligence},
   editor       = {Bernhard Nebel},
   address      = {Seattle, {US}},
   year         = {2001},
   pages        = {870--878},
   url          = {http://robotics.stanford.edu/~btaskar/pubs/ijcai01.ps},
   abstract     = {Supervised and unsupervised learning methods have traditionally 
                   focused on data consisting of independent instances of a single 
                   type. However, many real-world domains are best described by 
                   relational models in which instances of multiple types are 
                   related to each other in complex ways. For example, in a 
                   scientific paper domain, papers are related to each other via 
                   citation, and are also related to their authors. In this case, 
                   the label of one entity (e.g., the topic of the paper) is often 
                   correlated with the labels of related entities. We propose a 
                   general class of models for classification and clustering in 
                   relational domains that capture probabilistic dependencies 
                   between related instances. We show how to learn such models 
                   efficiently from data. We present empirical results on two real 
                   world data sets. Our experiments in a transductive classification 
                   setting indicate that accuracy can be significantly improved by 
                   modeling relational dependencies. Our algorithm automatically 
                   induces a very natural behavior, where our knowledge about one 
                   instance helps us classify related ones, which in turn help us 
                   classify others. In an unsupervised setting, our models produced 
                   coherent clusters with a very natural interpretation, even for 
                   instance types that do not have any attributes.},
}
@inProceedings{Taskar02,
   author       = {Ben Taskar and Pieter Abbeel and Daphne Koller},
   title        = {Discriminative probabilistic models of relational data},
   booktitle    = {Proceedings of UAI-02, 18th Conference on Uncertainty in 
                   Artificial Intelligence},
   year         = {2002},
   address      = {Edmonton, {CA}},
   pages        = {485--492},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   editor       = {},
   url          = {},
   abstract     = {In many supervised learning tasks, the entities to be labeled are 
                   related to each other in complex ways and their labels are not 
                   independent. For example, in hypertext classification, the labels 
                   of linked pages are highly correlated. A standard approach is to 
                   classify each entity independently, ignoring the correlations 
                   between them. Recently, Probabilistic Relational Models, a 
                   relational version of Bayesian networks, were used to define a 
                   joint probabilistic model for a collection of related entities. 
                   In this paper, we present an alternative framework that builds on 
                   (conditional) Markov networks and addresses two limitations of 
                   the previous approach. First, undirected models do not impose the 
                   acyclicity constraint that hinders representation of many 
                   important relational dependencies in directed models. Second, 
                   undirected models are well suited for discriminative training, 
                   where we optimize the conditional likelihood of the labels given 
                   the features, which generally improves classification accuracy. 
                   We show how to train these models effectively, and how to use 
                   approximate probabilistic inference over the learned model for 
                   collective classification of multiple related entities. We 
                   provide experimental results on a webpage classification task, 
                   showing that accuracy can be significantly improved by modeling 
                   relational dependencies.},
}
@inProceedings{Tauritz99,
   author       = {Daniel R. Tauritz and Ida G. Sprinkhuizen-Kuyper},
   title        = {Adaptive Information Filtering Algorithms},
   booktitle    = {Proceedings of IDA-99, 3rd Symposium on Intelligent Data Analysis},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1642},
   editor       = {David J. Hand and Joost N. Kok and Michael R. Berthold},
   address      = {Amsterdam, {NL}},
   year         = {1999},
   pages        = {513--524},
   url          = {http://link.springer.de/link/service/series/0558/papers/1642/16420513.pdf},
   abstract     = {Adaptive information filtering is concerned with filtering 
                   information streams in changing environments. The changes may 
                   occur both on the transmission side (the nature of the streams 
                   can change) and on the reception side (the interests of a user 
                   can change). The research described in this paper details the 
                   progress made in a prototype adaptive information filtering 
                   system based on weighted trigram analysis and evolutionary 
                   computation. The main improvements of the algorithms employed by 
                   the system concern the computation of the distance between 
                    weighted trigram vectors and further analysis of the two-pool 
                   evolutionary algorithm. We tested our new prototype system on the 
                   Reuters-21578 text categorization test collection.},
}
@article{Tauritz00,
   author       = {Daniel R. Tauritz and Joost N. Kok and Ida G. Sprinkhuizen-Kuyper},
   title        = {Adaptive information filtering using evolutionary computation},
   journal      = {Information Sciences},
   year         = {2000},
   volume       = {122},
   number       = {2/4},
   pages        = {121--140},
   url          = {http://www.elsevier.nl/gej-ng/10/23/143/56/27/27/article.pdf},
   abstract     = {Information Filtering is concerned with filtering data streams in 
                   such a way as to leave only pertinent data (information) to be 
                   perused. When the data streams are produced in a changing 
                   environment the filtering has to adapt too in order to remain 
                   effective. Adaptive Information Filtering (AIF) is concerned with 
                   filtering in changing environments. The changes may occur both on 
                   the transmission side (the nature of the streams can change), and 
                   on the reception side (the interest of a user can change). 
                   Weighted trigram analysis is a quick and flexible technique for 
                   describing the contents of a document. A novel application of 
                   evolutionary computation is its use in Adaptive Information 
                   Filtering for optimizing various parameters, notably the weights 
                   associated with trigrams. The research described in this paper 
                   combines weighted trigram analysis, clustering, and a special 
                   two-pool evolutionary algorithm, to create an Adaptive 
                   Information Filtering system with such useful properties as 
                   domain independence, spelling error insensitivity, adaptability, 
                   and optimal use of user feedback while minimizing the amount of 
                   user feedback required to function properly. We designed a 
                   special evolutionary algorithm with a two-pool strategy for this 
                   changing environment.},
}
@inProceedings{Teahan00,
   author       = {William J. Teahan},
   title        = {Text classification and segmentation using minimum cross-entropy},
   booktitle    = {Proceedings of RIAO-00, 6th International Conference ``Recherche 
                    d'Information Assist\'ee par Ordinateur''},
   editor       = {},
   address      = {Paris, {FR}},
   year         = {2000},
   pages        = {},
   url          = {},
   abstract     = {Several methods for classifying and segmenting text are 
                   described. These are based on ranking text sequences by their 
                   cross-entropy calculated using a fixed order character-based 
                   Markov model adapted from the PPM text compression algorithm. 
                   Experimental results show that the methods are a significant 
                   improvement over previously used methods in a number of areas. 
                   For example, text can be classified with a very high degree of 
                   accuracy by authorship, language, dialect and genre. Highly 
                   accurate text segmentation is also possible - the accuracy of the 
                   PPM-based Chinese word segmenter is close to 99\% on Chinese news 
                   text; similarly, a PPM-based method of segmenting text by 
                   language achieves an accuracy of over 99\%.},
}
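%
% A minimal Python sketch of the minimum cross-entropy decision rule described
% in the Teahan00 abstract, with a fixed-order character model. Add-alpha
% smoothing stands in for PPM's escape mechanism, and the smoothing constant
% and alphabet size are assumptions made for the illustration.
%
from collections import defaultdict
from math import log2

def train_char_model(texts, order=3):
    # Counts of the next character given the preceding `order` characters.
    counts = defaultdict(lambda: defaultdict(int))
    for text in texts:
        padded = " " * order + text
        for i in range(order, len(padded)):
            counts[padded[i - order:i]][padded[i]] += 1
    return counts

def cross_entropy(text, model, order=3, alpha=0.5, alphabet=256):
    # Average number of bits per character of `text` under `model`.
    padded = " " * order + text
    bits = 0.0
    for i in range(order, len(padded)):
        context, char = padded[i - order:i], padded[i]
        total = sum(model[context].values())
        p = (model[context][char] + alpha) / (total + alpha * alphabet)
        bits -= log2(p)
    return bits / max(len(text), 1)

def classify(text, models):
    # models: dict mapping each class label to a trained character model.
    # Assign the class whose model gives the minimum cross-entropy.
    return min(models, key=lambda c: cross_entropy(text, models[c]))
%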
@inProceedings{Teytaud01,
   author       = {Teytaud, Olivier and Jalam, Radwan},
   title        = {Kernel based text categorization},
   booktitle    = {Proceedings of IJCNN-01, 12th International Joint Conference on 
                   Neural Networks},
   editor       = {},
   address      = {Washington, {US}},
   year         = {2001},
   pages        = {},
   url          = {},
   abstract     = {},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
}
@inProceedings{Theeramunkong02,
   author       = {Thanaruk Theeramunkong and Verayuth Lertnattee},
   title        = {Multi-Dimensional Text Classification},
   booktitle    = {Proceedings of COLING-02, 19th International Conference on 
                   Computational Linguistics},
   year         = {2002},
   editor       = {},
   pages        = {},
   address      = {Taipei, {TW}},
   url          = {http://acl.ldc.upenn.edu/coling2002/proceedings/data/area-28/co-399.pdf},
   abstract     = {This paper proposes a multi-dimensional framework for classifying 
                   text documents. In this framework, the concept of 
                   multidimensional category model is introduced for representing 
                   classes. In contrast with traditional flat and hierarchical 
                   category models, the multi-dimensional category model classifies 
                   each text document in a collection using multiple predefined sets 
                   of categories, where each set corresponds to a dimension. Since a 
                   multi-dimensional model can be converted to flat and hierarchical 
                   models, three classification strategies are possible, i.e., 
                   classifying directly based on the multi-dimensional model and 
                   classifying with the equivalent flat or hierarchical models. The 
                   efficiency of these three classifications is investigated on two 
                    data sets. Using k-NN, naïve Bayes and centroid-based classifiers, 
                    the experimental results show that the multi-dimensional-based 
                    and hierarchical-based classifications perform better than the 
                    flat-based classification.},
}
@inProceedings{Thompson01,
   author       = {Paul Thompson},
   title        = {Automatic categorization of case law},
   booktitle    = {Proceedings of ICAIL-01, 8th International Conference on 
                   Artificial Intelligence and Law},
   editor       = {},
   year         = {2001},
   address      = {St.\ Louis, {US}},
   pages        = {70--77},
   publisher    = {{ACM} Press, New York, {US}},
   url          = {http://doi.acm.org/10.1145/383535.383543},
   abstract     = {This paper describes a series of automatic text categorization 
                   experiments with case law documents. Cases are categorized into 
                   40 broad, high-level categories. These results are compared to an 
                   existing operational process using Boolean queries manually 
                   constructed by domain experts. In this categorization process 
                   recall is considered more important than precision. This paper 
                   investigates three algorithms that potentially could automate 
                   this categorization process: 1) a nearest neighbor-like 
                    algorithm; 2) C4.5rules, a machine learning decision tree 
                   algorithm; and 3) Ripper, a machine learning rule induction 
                   algorithm. The results obtained by Ripper surpass those of the 
                   operational process.},
}
@inProceedings{Tong92,
   author       = {Richard Tong and Adam Winkler and Pamela Gage},
   title        = {Classification Trees for Document Routing: A Report on the {TREC} 
                   Experiment},
   booktitle    = {Proceedings of TREC-1, 1st Text Retrieval Conference},
   publisher    = {National Institute of Standards and Technology, Gaithersburg, {US}},
   editor       = {Donna K. Harman},
   year         = {1992},
   address      = {Gaithersburg, {US}},
   pages        = {209--228},
   url          = {http://trec.nist.gov/pubs/trec1/papers/17.txt},
   abstract     = {Describes an approach to document routing on the TREC corpus that 
                   employs a technique for the automatic construction of 
                   classification trees. The approach makes use of the 
                   Classification and Regression Trees (CART) algorithm that has 
                   seen application in various areas of machine learning. The 
                   authors' initial work with this algorithm has demonstrated that 
                   probabilistic structures can be automatically acquired from a 
                   training set of documents with respect to a single target 
                   concept, or a set of related concepts. These structures can then 
                   be applied to individual documents to derive a posterior 
                   probability that the document is about a particular target 
                   concept.},
}
@inProceedings{Tong00,
   author       = {Simon Tong and Daphne Koller},
   title        = {Support Vector Machine Active Learning with Applications to Text 
                   Classification},
   booktitle    = {Proceedings of ICML-00, 17th International Conference on Machine 
                   Learning},
   editor       = {Pat Langley},
   year         = {2000},
   address      = {Stanford, {US}},
   pages        = {999--1006},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www.robotics.stanford.edu/~stong/papers/tong_koller_ml00.ps.gz},
   abstract     = {Support vector machines have met with significant success in 
                   numerous real-world learning tasks. However, like most machine 
                   learning algorithms, they are generally applied using a randomly 
                   selected training set classified in advance. In many settings, we 
                   also have the option of using pool-based active learning. Instead 
                   of using a randomly selected training set, the learner has access 
                   to a pool of unlabeled instances and can request the labels for 
                    some number of them. We introduce a new algorithm for performing 
                   active learning with support vector machines, i.e., an algorithm 
                   for choosing which instances to request next. We provide a 
                   theoretical motivation for the algorithm. We present experimental 
                   results showing that employing our active learning method can 
                   significantly reduce the need for labeled training instances in 
                   both the standard inductive and transductive settings.},
   note         = {An extended version appears as \cite{Tong01}},
}
@article{Tong01,
   author       = {Simon Tong and Daphne Koller},
   title        = {Support Vector Machine Active Learning with Applications to Text 
                   Classification},
   journal      = {Journal of Machine Learning Research},
   volume       = {2},
   month        = {November},
   pages        = {45--66},
   year         = {2001},
   url          = {http://www.ai.mit.edu/projects/jmlr/papers/volume2/tong01a/tong01a.pdf},
   abstract     = {Support vector machines have met with significant success in 
                   numerous real-world learning tasks. However, like most machine 
                   learning algorithms, they are generally applied using a randomly 
                   selected training set classified in advance. In many settings, we 
                   also have the option of using pool-based active learning. Instead 
                   of using a randomly selected training set, the learner has access 
                   to a pool of unlabeled instances and can request the labels for 
                   some number of them. We introduce a new algorithm for performing 
                   active learning with support vector machines, i.e., an algorithm 
                   for choosing which instances to request next. We provide a 
                   theoretical motivation for the algorithm using the notion of a 
                   version space. We present experimental results showing that 
                   employing our active learning method can significantly reduce the 
                   need for labeled training instances in both the standard 
                   inductive and transductive settings.},
}
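%
% A minimal Python sketch of pool-based active learning with an SVM, in the
% spirit of the two Tong/Koller entries above: repeatedly train on the labeled
% set and request the label of the pool instance closest to the current
% decision boundary. It uses scikit-learn purely for illustration and is not
% the authors' implementation, which is motivated via version spaces and
% includes further querying strategies.
%
import numpy as np
from sklearn.svm import SVC

def active_learning_loop(X_pool, y_pool, labeled_idx, rounds=10):
    # X_pool, y_pool: numpy arrays for the pool; y_pool plays the role of the
    # oracle and is consulted only for queried instances. labeled_idx must
    # already contain examples of both classes.
    labeled = list(labeled_idx)
    unlabeled = [i for i in range(len(X_pool)) if i not in labeled]
    clf = SVC(kernel="linear")
    for _ in range(rounds):
        clf.fit(X_pool[labeled], y_pool[labeled])
        # Query the pool instance closest to the separating hyperplane.
        margins = np.abs(clf.decision_function(X_pool[unlabeled]))
        pick = unlabeled[int(np.argmin(margins))]
        labeled.append(pick)
        unlabeled.remove(pick)
    return clf
%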
@inProceedings{Toutanova01,
   author       = {Kristina Toutanova and Francine Chen and Kris Popat and Thomas 
                   Hofmann},
   title        = {Text Classification in a Hierarchical Mixture Model for Small 
                   Training Sets},
   booktitle    = {Proceedings of CIKM-01, 10th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {Henrique Paques and Ling Liu and David Grossman},
   year         = {2001},
   address      = {Atlanta, {US}},
   pages        = {105--113},
   url          = {http://www.stanford.edu/~krist/papers/cikm2001.pdf},
   abstract     = {Documents are commonly categorized into hierarchies of topics, 
                   such as the ones maintained by Yahoo! and the Open Directory 
                   project, in order to facilitate browsing and other interactive 
                   forms of information retrieval. In addition, topic hierarchies 
                   can be utilized to overcome the sparseness problem in text 
                   categorization with a large number of categories, which is the 
                   main focus of this paper. This paper presents a hierarchical 
                   mixture model which extends the standard naive Bayes classifier 
                   and previous hierarchical approaches. Improved estimates of the 
                   term distributions are made by differentiation of words in the 
                   hierarchy according to their level of generality/specificity. 
                   Experiments on the Newsgroups and the Reuters-21578 dataset 
                   indicate improved performance of the proposed classifier in 
                   comparison to other state-of-the-art methods on datasets with a 
                   small number of positive examples.},
}
@article{Turney00,
   author       = {Peter D. Turney},
   title        = {Learning Algorithms for Keyphrase Extraction},
   journal      = {Information Retrieval},
   number       = {4},
   volume       = {2},
   pages        = {303--336},
   year         = {2000},
   url          = {http://extractor.iit.nrc.ca/reports/IR2000.ps.Z},
   abstract     = {Many academic journals ask their authors to provide a list of 
                   about five to fifteen keywords, to appear on the first page of 
                    each article. Since these keywords are often phrases of two or 
                   more words, we prefer to call them keyphrases. There is a wide 
                   variety of tasks for which keyphrases are useful, as we discuss 
                   in this paper. We approach the problem of automatically 
                   extracting keyphrases from text as a supervised learning task. We 
                   treat a document as a set of phrases, which the learning 
                   algorithm must learn to classify as positive or negative examples 
                   of keyphrases. Our first set of experiments applies the C4.5 
                   decision tree induction algorithm to this learning task. We 
                   evaluate the performance of nine different configurations of 
                   C4.5. The second set of experiments applies the GenEx algorithm 
                   to the task. We developed the GenEx algorithm specifically for 
                   automatically extracting keyphrases from text. The experimental 
                   results support the claim that a custom-designed algorithm 
                   (GenEx), incorporating specialized procedural domain knowledge, 
                   can generate better keyphrases than a general-purpose algorithm 
                   (C4.5). Subjective human evaluation of the keyphrases generated 
                   by Extractor suggests that about 80\% of the keyphrases are 
                   acceptable to human readers. This level of performance should be 
                   satisfactory for a wide variety of applications.},
}
@inProceedings{Tzeras93,
   author       = {Tzeras, Konstadinos and Hartmann, Stephan},
   title        = {Automatic indexing based on {B}ayesian inference networks},
   booktitle    = {Proceedings of SIGIR-93, 16th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Robert Korfhage and Edie Rasmussen and Peter Willett},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Pittsburgh, {US}},
   pages        = {22--34},
   year         = {1993},
   url          = {http://www.darmstadt.gmd.de/~tzeras/FullPapers/gz/Tzeras-Hartmann-93.ps.gz},
   abstract     = {In this paper, a Bayesian inference network model for automatic 
                   indexing with index terms (descriptors) from a prescribed 
                   vocabulary is presented. It requires an indexing dictionary with 
                   rules mapping terms of the respective subject field onto 
                   descriptors and inverted lists for terms occurring in a set of 
                   documents of the subject field and descriptors manually assigned 
                   to these documents. The indexing dictionary can be derived 
                   automatically from a set of manually indexed documents. An 
                   application of the network model is described, followed by an 
                   indexing example and some experimental results about the indexing 
                   performance of the network model.},
}
@article{Uren02,
   author       = {Victoria S. Uren and Thomas R. Addis},
   title        = {How weak categorizers based upon different principles strengthen 
                   performance},
   journal      = {The Computer Journal},
   year         = {2002},
   volume       = {45},
   number       = {5},
   pages        = {511--524},
   url          = {http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_05/pdf/450511.pdf},
   abstract     = {Combining the results of classifiers has shown much promise in 
                   machine learning generally. However, published work on combining 
                   text categorizers suggests that, for this particular application, 
                   improvements in performance are hard to attain. Explorative 
                   research using a simple voting system is presented and discussed 
                   in the light of a probabilistic model that was originally 
                   developed for safety critical software. It was found that typical 
                   categorization approaches produce predictions which are too 
                   similar for combining them to be effective since they tend to 
                   fail on the same records. Further experiments using two less 
                   orthodox categorizers are also presented which suggest that 
                   combining text categorizers can be successful, provided the 
                   essential element of 'difference' is considered.},
}
@article{Urena01,
   author       = {L. Alfonso Ure{\~{n}}a-L{\'{o}}pez and Manuel Buenaga and 
                   Jos{\'{e}} M. G{\'{o}}mez},
   title        = {Integrating linguistic resources in {TC} through {WSD}},
   journal      = {Computers and the Humanities},
   year         = {2001},
   number       = {2},
   volume       = {35},
   pages        = {215--230},
   url          = {http://www.wkap.nl/article.pdf?266250},
   abstract     = {Information access methods must be improved to overcome the 
                   information overload that most professionals face nowadays. Text 
                    classification tasks, like text categorization, help users to 
                    access the great amount of text they find on the Internet and in 
                    their organizations. TC is the classification of documents into a 
                   predefined set of categories. Most approaches to automatic TC are 
                   based on the utilization of a training collection, which is a set 
                   of manually classified documents. Other linguistic resources that 
                   are emerging, like lexical databases, can also be used for 
                   classification tasks. This article describes an approach to TC 
                   based on the integration of a training collection (Reuters-21578) 
                   and a lexical database (WORDNET 1.6) as knowledge sources. 
                   Lexical databases accumulate information on the lexical items of 
                   one or several languages. This information must be filtered in 
                    order to make effective use of it in our model of TC. This 
                   filtering process is a word sense disambiguation task. WSD is the 
                   identification of the sense of words in context. This task is an 
                   intermediate process in many natural language processing tasks 
                   like machine translation or multilingual information retrieval. 
                   We present the utilization of WSD as an aid for TC. Our approach 
                   to WSD is also based on the integration of two linguistic 
                   resources: a training collection (SEMCOR and Reuters-21578) and a 
                   lexical database (WORDNET 1.6).},
}
@inProceedings{Vert01,
   author       = {Jean-Philippe Vert},
   title        = {Text Categorization Using Adaptive Context Trees},
   booktitle    = {Proceedings of CICLING-01, 2nd International Conference on 
                   Computational Linguistics and Intelligent Text Processing},
   year         = {2001},
   editor       = {Alexander Gelbukh},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Mexico City, {MX}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2004},
   pages        = {423--436},
   url          = {http://link.springer.de/link/service/series/0558/papers/2004/20040423.pdf},
   abstract     = {A new way of representing texts written in natural language is 
                   introduced, as a conditional probability distribution at the 
                    letter level, learned with a variable-length Markov model called 
                    the adaptive context tree model. Text categorization experiments 
                    demonstrate the ability of this representation to capture 
                    information about the semantic content of the text.},
}
@inProceedings{Viechnicki98,
   author       = {Peter Viechnicki},
   title        = {A Performance Evaluation of Automatic Survey Classifiers},
   booktitle    = {Proceedings of ICGI-98, 4th International Colloquium on 
                   Grammatical Inference},
   address      = {Ames, {US}},
   editor       = {Vasant Honavar and Giora Slutzki},
   year         = {1998},
   pages        = {244--256},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1433},
}
@inProceedings{Vinokourov01,
   author       = {Alexei Vinokourov and Mark Girolami},
   title        = {Document Classification Employing the {F}isher Kernel Derived 
                   from Probabilistic Hierarchic Corpus Representations},
   booktitle    = {Proceedings of ECIR-01, 23rd European Colloquium on Information 
                   Retrieval Research},
   editor       = {},
   year         = {2001},
   address      = {Darmstadt, {DE}},
   publisher    = {},
   pages        = {24--40},
   url          = {http://cis.paisley.ac.uk/vino-ci0/fisher_hierarchic.ps},
   abstract     = {This paper demonstrates that the probabilistic corpus model which 
                   emerges from the automatic or unsupervised hierarchical 
                   organisation of a document collection can be further exploited to 
                   create a kernel which boosts the performance of state-of-the-art 
                   Support Vector Machine document classifiers. It is demonstrated 
                   that the performance of such a classifier is further enhanced 
                   when employing the kernel derived from an appropriate hierarchic 
                   mixture model used for partitioning a document corpus rather than 
                    the kernel associated with a flat non-hierarchic mixture model. 
                   This has important implications for document classification when 
                   a hierarchic ordering of topics exists. This can be considered as 
                   the effective combination of documents with no topic or class 
                   labels (unlabeled data), labeled documents, and prior domain 
                   knowledge (in the form of the known hierarchic structure), in 
                   providing enhanced document classification performance.},
}
@article{Vinokourov02,
   author       = {Alexei Vinokourov and Mark Girolami},
   title        = {A Probabilistic Framework for the Hierarchic Organisation and 
                   Classification of Document Collections},
   journal      = {Journal of Intelligent Information Systems},
   year         = {2002},
   note         = {Special Issue on Automated Text Categorization},
   volume       = {18},
   number       = {2/3},
   pages        = {153--172},
   url          = {http://www.wkap.nl/article.pdf?391244},
   abstract     = {This paper presents a probabilistic mixture modeling framework 
                   for the hierarchic organisation of document collections. It is 
                   demonstrated that the probabilistic corpus model which emerges 
                   from the automatic or unsupervised hierarchical organisation of a 
                   document collection can be further exploited to create a kernel 
                   which boosts the performance of state-of-the-art Support Vector 
                   Machine document classifiers. It is shown that the performance of 
                   such a classifier is further enhanced when employing the kernel 
                   derived from an appropriate hierarchic mixture model used for 
                   partitioning a document corpus rather than the kernel associated 
                   with a flat non-hierarchic mixture model. This has important 
                   implications for document classification when a hierarchic 
                   ordering of topics exists. This can be considered as the 
                   effective combination of documents with no topic or class labels 
                   (unlabeled data), labeled documents, and prior domain knowledge 
                   (in the form of the known hierarchic structure), in providing 
                   enhanced document classification performance.},
}
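%
% A minimal sketch, not taken from the article above, of how a document
% kernel can be plugged into an SVM: a plain linear TF-IDF kernel stands
% in for the Fisher kernel that the authors derive from a hierarchic
% mixture model. Assumes scikit-learn; the documents and labels are
% invented toy values.
%
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

train_docs = ["wheat prices rose", "corn harvest strong", "central bank cuts rates"]
train_labels = ["grain", "grain", "money-fx"]
test_docs = ["bank raises interest rates"]

vec = TfidfVectorizer()
Xtr = vec.fit_transform(train_docs)
Xte = vec.transform(test_docs)

# Gram matrices; a Fisher kernel would be computed from model gradients instead.
K_train = Xtr.dot(Xtr.T).toarray()   # kernel between training documents
K_test = Xte.dot(Xtr.T).toarray()    # kernel between test and training documents

clf = SVC(kernel="precomputed").fit(K_train, train_labels)
print(clf.predict(K_test))
%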
@inProceedings{Wang99,
   author       = {Hui Wang and Nguyen H. Son},
   title        = {Text classification using lattice machine},
   booktitle    = {Proceedings of ISMIS-99, 11th International Symposium on 
                   Methodologies for Intelligent Systems},
   editor       = {Andrzej Skowron and Zbigniew W. Ra{\'{s}}},
   pages        = {235--243},
   year         = {1999},
   address      = {Warsaw, {PL}},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1609},
   url          = {},
   abstract     = {},
}
@inProceedings{Wang99a,
   author       = {Ke Wang and Senquiang Zhou and Shiang Chen Liew},
   title        = {Building hierarchical classifiers using class proximity},
   booktitle    = {Proceedings of VLDB-99, 25th International Conference on Very 
                   Large Data Bases},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   editor       = {Malcolm P. Atkinson and Maria E. Orlowska and Patrick Valduriez 
                   and Stanley B. Zdonik and Michael L. Brodie},
   year         = {1999},
   address      = {Edinburgh, {UK}},
   pages        = {363--374},
   url          = {http://www.comp.nus.edu.sg/~wangk/pub/vldb99.ps},
   abstract     = {We explore how to organize a text database hierarchically to aid 
                   better searching and browsing. We propose to exploit the natural 
                   hierarchy of topics, or taxonomy, that many corpora, such as 
                   internet directories, digital libraries, and patent databases 
                   enjoy. In our system, the user navigates through the query 
                   response not as a flat unstructured list, but embedded in the 
                   familiar taxonomy, and annotated with document signatures 
                   computed dynamically with respect to where the user is located at 
                   any time. We show how to update such databases with new documents 
                   with high speed and accuracy. We use techniques from statistical 
                   pattern recognition to efficiently separate the feature words or 
                   discriminants from the noise words at each node of the taxonomy. 
                   Using these, we build a multi-level classifier. At each node, 
                   this classifier can ignore the large number of noise words in a 
                   document. Thus the classifier has a small model size and is very 
                   fast. However, owing to the use of context-sensitive features, 
                   the classifier is very accurate. We report on experiences with 
                   the Reuters newswire benchmark, the US Patent database, and web 
                    document samples from {{\sc Yahoo!}}.},
}
@inProceedings{Wang01,
   author       = {Ke Wang and Senquiang Zhou and Yu He},
   title        = {Hierarchical Classification of Real Life Documents},
   booktitle    = {Proceedings of the 1st SIAM International Conference on Data 
                   Mining},
   publisher    = {},
   editor       = {},
   year         = {2001},
   address      = {Chicago, {US}},
   pages        = {},
   url          = {http://www.cs.sfu.ca/~wangk/pub/sdm2001.ps},
   abstract     = {},
}
@inProceedings{Wang00,
   author       = {Wang, Wenxian and Meng, Weiyi and Yu, Clement},
   title        = {Concept hierarchy based text database categorization in a 
                   metasearch engine environment},
   booktitle    = {Proceedings of WISE-00, 1st International Conference on Web 
                   Information Systems Engineering},
   editor       = {Li, Qing and Ozsoyoglu, Z. Meral and Wagner, Roland and 
                   Kambayashi, Yahiko and Zhang, Yanchun},
   pages        = {283--290},
   year         = {2000},
   address      = {Hong Kong, {CN}},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   volume       = {1},
   url          = {http://panda.cs.binghamton.edu/~meng/pub.d/wise00.doc},
   abstract     = {Document categorization, as a technique to improve the retrieval 
                   of useful documents, has been extensively investigated. One 
                   important issue in a large-scale meta-search engine is to select 
                   text databases that are likely to contain useful documents for a 
                   given query. We believe that database categorization can be a 
                   potentially effective technique for good database selection, 
                   especially in the Internet environment, where short queries are 
                   usually submitted. In this paper, we propose and evaluate several 
                   database categorization algorithms. This study indicates that, 
                   while some document categorization algorithms could be adopted 
                   for database categorization, algorithms that take into 
                   consideration the special characteristics of databases may be 
                   more effective. Preliminary experimental results are provided to 
                   compare the proposed database categorization algorithms.},
}
@inProceedings{Wei01,
   author       = {Chih-Ping Wei and Yuan-Xin Dong},
   title        = {A Mining-based Category Evolution Approach to Managing Online 
                   Document Categories},
   booktitle    = {Proceedings of HICSS-01, 34th Annual Hawaii International 
                   Conference on System Sciences},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   editor       = {Ralph H. Sprague},
   year         = {2001},
   address      = {Maui, {US}},
   pages        = {},
   url          = {http://dlib.computer.org/conferen/hicss/0981/pdf/09817061.pdf},
   abstract     = {With rapid expansion of the numbers and sizes of text 
                   repositories and improvements in global connectivity, the 
                   quantity of information available online as free-format text is 
                   growing exponentially. Many large organizations create and 
                   maintain huge volumes of textual information online, and there is 
                   a pressing need for support of efficient and effective 
                   information retrieval, filtering, and management. Text 
                   categorization, or the assignment of textual documents to one or 
                   more pre-defined categories based on their content, is an 
                   essential component of efficient management and retrieval of 
                   documents. Previously, research has focused predominantly on 
                   developing or adopting statistical classification or inductive 
                   learning methods for automatically discovering text 
                   categorization patterns for a pre-defined set of categories. 
                   However, as documents accumulate, such categories may not capture 
                   a document's characteristics correctly. In this study, we 
                   proposed a mining-based category evolution (MiCE) technique to 
                   adjust document categories based on existing categories and their 
                   associated documents. Empirical evaluation results indicate that 
                   the proposed technique, MiCE, was more effective than the 
                   category discovery approach and was insensitive to the quality of 
                   original categories.},
}
@article{Weigend99,
   author       = {Andreas S. Weigend and Erik D. Wiener and Jan O. Pedersen},
   title        = {Exploiting hierarchy in text categorization},
   journal      = {Information Retrieval},
   number       = {3},
   volume       = {1},
   pages        = {193--216},
   year         = {1999},
   url          = {http://www.stern.nyu.edu/~aweigend/Research/Papers/TextCategorization/hierarchy.ps},
   abstract     = {With the recent dramatic increase in electronic access to 
                   documents, text categorization-the task of assigning topics to a 
                   given document-has moved to the center of the information 
                   sciences and knowledge management. This article uses the 
                   structure that is present in the semantic space of topics in 
                   order to improve performance in text categorization: according to 
                   their meaning, topics can be grouped together into 
                   ``meta-topics'', e.g., gold, silver, and copper are all metals. 
                   The proposed architecture matches the hierarchical structure of 
                   the topic space, as opposed to a flat model that ignores the 
                   structure. It accommodates both single and multiple topic 
                   assignments for each document. Its probabilistic interpretation 
                   allows its predictions to be combined in a principled way with 
                   information from other sources. The first level of the 
                   architecture predicts the probabilities of the meta-topic groups. 
                   This allows the individual models for each topic on the second 
                   level to focus on finer discriminations within the group. 
                   Evaluating the performance of a two-level implementation on the 
                   Reuters-22173 testbed of newswire articles shows the most 
                   significant improvement for rare classes.},
}
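%
% A minimal sketch, not taken from the article above, of the two-level
% idea: a first classifier predicts the meta-topic group, and a per-group
% classifier then discriminates among topics within that group. Assumes
% scikit-learn; the meta-topic mapping and toy documents are invented.
%
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

meta_of = {"gold": "metals", "copper": "metals", "wheat": "grains", "corn": "grains"}
docs = ["gold futures climbed", "copper output fell", "wheat harvest strong", "corn exports up"]
topics = ["gold", "copper", "wheat", "corn"]

vec = TfidfVectorizer()
X = vec.fit_transform(docs)

# Level 1: predict the meta-topic group.
top = LogisticRegression(max_iter=1000).fit(X, [meta_of[t] for t in topics])

# Level 2: one classifier per group, trained only on that group's documents.
fine = {}
for group in set(meta_of.values()):
    idx = [i for i, t in enumerate(topics) if meta_of[t] == group]
    fine[group] = LogisticRegression(max_iter=1000).fit(X[idx], [topics[i] for i in idx])

def classify(text):
    x = vec.transform([text])
    return fine[top.predict(x)[0]].predict(x)[0]

print(classify("gold prices rise"))
%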
@article{Weiss99,
   author       = {Sholom M. Weiss and Chidanand Apt\'{e} and Fred J. Damerau and 
                   David E. Johnson and Frank J. Oles and Thilo Goetz and Thomas 
                   Hampp},
   title        = {Maximizing text-mining performance},
   journal      = {{IEEE} Intelligent Systems},
   year         = {1999},
   number       = {4},
   volume       = {14},
   pages        = {63--69},
   url          = {http://www.research.ibm.com/dar/papers/pdf/ieee99_mtmp.pdf},
   abstract     = {With the advent of centralized data warehouses, where data might 
                   be stored as electronic documents or as text fields in databases, 
                   text mining has increased in importance and economic value. One 
                   important goal in text mining is automatic classification of 
                   electronic documents. Computer programs scan text in a document 
                   and apply a model that assigns the document to one or more 
                   prespecified topics. Researchers have used benchmark data, such 
                   as the Reuters-21578 test collection, to measure advances in 
                   automated text categorization. Conventional methods such as 
                   decision trees have had competitive, but not optimal, predictive 
                   performance. Using the Reuters collection, we show that adaptive 
                   resampling techniques can improve decision-tree performance and 
                   that relatively small, pooled local dictionaries are effective. 
                   We've applied these techniques to online banking applications to 
                   enhance automated e-mail routing.},
}
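%
% A minimal sketch, not taken from the article above, of boosting shallow
% decision trees over a restricted dictionary, in the spirit of the
% adaptive-resampling idea it describes. Assumes scikit-learn (AdaBoost's
% default base learner is a decision stump); the toy documents are invented.
%
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import AdaBoostClassifier

docs = ["oil prices surge", "crude output cut", "team wins final", "coach praises squad"]
labels = ["crude", "crude", "sport", "sport"]

# A small pooled dictionary: keep at most 1000 binary term features.
vec = CountVectorizer(max_features=1000, binary=True)
X = vec.fit_transform(docs)

clf = AdaBoostClassifier(n_estimators=50).fit(X, labels)
print(clf.predict(vec.transform(["oil output rises"])))
%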
@inProceedings{Wermter99a,
   author       = {Stefan Wermter and Garen Arevian and Christo Panchev},
   title        = {Recurrent Neural Network Learning for Text Routing},
   booktitle    = {Proceedings of ICANN-99, 9th International Conference on 
                   Artificial Neural Networks},
   publisher    = {Institution of Electrical Engineers, London, {UK}},
   editor       = {},
   year         = {1999},
   pages        = {898--903},
   address      = {Edinburgh, {UK}},
   url          = {http://www.his.sunderland.ac.uk/ps/icann99.pdf},
   abstract     = {This paper describes new recurrent plausibility networks with 
                   internal recurrent hysteresis connections. These recurrent 
                   connections in multiple layers encode the sequential context of 
                   word sequences. We show how these networks can support text 
                   routing of noisy newswire titles according to different given 
                   categories. We demonstrate the potential of these networks using 
                   an 82,339 word corpus from the Reuters newswire, reaching recall 
                   and precision rates above 92\%. In addition, we carefully analyze 
                   the internal representation using cluster analysis and output 
                   representations using a new surface error technique. In general, 
                   based on the current recall and precision performance, as well as 
                   the detailed analysis, we show that recurrent plausibility 
                   networks hold a lot of potential for developing learning and 
                   robust newswire agents for the internet.},
}
@inProceedings{Wermter99,
   author       = {Stefan Wermter and Christo Panchev and Garen Arevian},
   title        = {Hybrid Neural Plausibility Networks for News Agents},
   booktitle    = {Proceedings of AAAI-99, 16th Conference of the American 
                   Association for Artificial Intelligence},
   publisher    = {{AAAI} Press, Menlo Park, {US}},
   editor       = {},
   year         = {1999},
   pages        = {93--98},
   address      = {Orlando, {US}},
   url          = {http://www.his.sunderland.ac.uk/ps/aaai99.pdf},
   abstract     = {This paper describes a learning news agent HyNeT which uses 
                   hybrid neural network techniques for classifying news titles as 
                   they appear on an internet newswire. Recurrent plausibility 
                   networks with local memory are developed and examined for 
                   learning robust text routing. HyNeT is described for the first 
                   time in this paper. We show that a careful hybrid integration of 
                   techniques from neural network architectures, learning and 
                   information retrieval can reach consistent recall and precision 
                   rates of more than 92\% on an 82,000 word corpus; this is 
                   demonstrated for 10,000 unknown news titles from the Reuters 
                   newswire. This new synthesis of neural networks, learning and 
                   information retrieval techniques allows us to scale up to a 
                   real-world task and demonstrates a lot of potential for hybrid 
                   plausibility networks for semantic text routing agents on the 
                   internet.},
}
@article{Wermter00,
   author       = {Stefan Wermter},
   title        = {Neural Network Agents for Learning Semantic Text Classification},
   journal      = {Information Retrieval},
   number       = {2},
   volume       = {3},
   pages        = {87--103},
   year         = {2000},
   url          = {http://www.his.sunderland.ac.uk/ps/ir4.pdf},
   abstract     = {The research project AgNeT develops Agents for Neural Text 
                   routing in the internet. Unrestricted potentially faulty text 
                   messages arrive at a certain delivery point (e.g. email address 
                   or world wide web address). These text messages are scanned and 
                   then distributed to one of several expert agents according to a 
                    certain task criterion. Possible specific scenarios within this 
                   framework include the learning of the routing of publication 
                   titles or news titles. In this paper we describe extensive 
                   experiments for semantic text routing based on classified library 
                   titles and newswire titles. This task is challenging since 
                   incoming messages may contain constructions which have not been 
                   anticipated. Therefore, the contributions of this research are in 
                   learning and generalizing neural architectures for the robust 
                   interpretation of potentially noisy unrestricted messages. Neural 
                   networks were developed and examined for this topic since they 
                   support robustness and learning in noisy unrestricted real-world 
                   texts. We describe and compare different sets of experiments. The 
                   first set of experiments tests a recurrent neural network for the 
                   task of library title classification. Then we describe a larger 
                   more difficult newswire classification task from information 
                   retrieval. The comparison of the examined models demonstrates 
                   that techniques from information retrieval integrated into 
                   recurrent plausibility networks performed well even under noise 
                   and for different corpora.},
}
@inProceedings{Wermter02,
   author       = {Stefan Wermter and Chihli Hung},
   title        = {Selforganizing classification on the {R}euters news corpus},
   booktitle    = {Proceedings of COLING-02, the 19th International Conference on 
                   Computational Linguistics},
   year         = {2002},
   editor       = {},
   pages        = {},
   address      = {Taipei, {TW}},
   url          = {http://www.his.sunderland.ac.uk/ps/coling-232.pdf},
   abstract     = {In this paper we propose an integration of a selforganizing map 
                   and semantic networks from WordNet for a text classification task 
                   using the new Reuters news corpus. This neural model is based on 
                   significance vectors and benefits from the presentation of 
                   document clusters. The Hypernym relation in WordNet supplements 
                   the neural model in classification. We also analyse the 
                   relationships of news headlines and their contents of the new 
                   Reuters corpus by a series of experiments. This hybrid approach 
                   of neural selforganization and symbolic hypernym relationships is 
                    successful in achieving good classification rates on 100,000 
                   full-text news articles. These results demonstrate that this 
                   approach can scale up to a large real-world task and show a lot 
                   of potential for text classification.},
}
@inProceedings{Wibowo02,
   author       = {Wahyu Wibowo and Hugh E. Williams},
   title        = {Simple and accurate feature selection for hierarchical 
                   categorisation},
   booktitle    = {Proceedings of the 2002 ACM Symposium on Document Engineering},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {},
   year         = {2002},
   address      = {McLean, {US}},
   pages        = {111--118},
   url          = {http://doi.acm.org/10.1145/585058.585079},
   abstract     = {Categorisation of digital documents is useful for organisation 
                   and retrieval. While document categories can be a set of 
                   unstructured category labels, some document categories are 
                   hierarchically structured. This paper investigates automatic 
                   hierarchical categorisation and, specifically, the role of 
                   features in the development of more effective categorisers. We 
                   show that a good hierarchical machine learning-based categoriser 
                   can be developed using small numbers of features from 
                   pre-categorised training documents. Overall, we show that by 
                   using a few terms, categorisation accuracy can be improved 
                   substantially: unstructured leaf level categorisation can be 
                   improved by up to 8.6\%, while top-down hierarchical 
                   categorisation accuracy can be improved by up to 12\%. In 
                   addition, unlike other feature selection models --- which 
                   typically require different feature selection parameters for 
                   categories at different hierarchical levels --- our technique 
                   works equally well for all categories in a hierarchical 
                   structure. We conclude that, in general, more accurate 
                   hierarchical categorisation is possible by using our simple 
                   feature selection technique.},
}
@mastersThesis{Wiener95a,
   author       = {Erik D. Wiener},
   title        = {A neural network approach to topic spotting in text},
   school       = {Department of Computer Science, University of Colorado at Boulder},
   address      = {Boulder, {US}},
   year         = {1995},
   url          = {http://www.stern.nyu.edu/~aweigend/Research/Papers/TextCategorization/Wiener_Thesis95.ps},
   abstract     = {This paper presents an application of nonlinear neural networks 
                   to topic spotting. Neural networks allow us to model higher-order 
                   interaction between document terms and to simultaneously predict 
                   multiple topics using shared hidden features. In the context of 
                   this model, we compare two approaches to dimensionality reduction 
                   in representation: one based on term selection and another based 
                   on Latent Semantic Indexing (LSI). Two different methods are 
                   proposed for improving LSI representations for the topic spotting 
                   task. We find that term selection and our modified LSI 
                   representations lead to similar topic spotting performance, and 
                   that this performance is equal to or better than other published 
                   results on the same corpus.},
}
@inProceedings{Wiener95,
   author       = {Erik D. Wiener and Jan O. Pedersen and Andreas S. Weigend},
   title        = {A neural network approach to topic spotting},
   booktitle    = {Proceedings of SDAIR-95, 4th Annual Symposium on Document 
                   Analysis and Information Retrieval},
   publisher    = {},
   editor       = {},
   year         = {1995},
   address      = {Las Vegas, {US}},
   pages        = {317--332},
   url          = {http://www.stern.nyu.edu/~aweigend/Research/Papers/TextCategorization/Wiener.Pedersen.Weigend_SDAIR95.ps},
   abstract     = {This paper presents an application of nonlinear neural networks 
                   to topic spotting. Neural networks allow us to model higher-order 
                   interaction between document terms and to simultaneously predict 
                   multiple topics using shared hidden features. In the context of 
                   this model, we compare two approaches to dimensionality reduction 
                   in representation: one based on term selection and another based 
                   on Latent Semantic Indexing (LSI). Two different methods are 
                   proposed for improving LSI representations for the topic spotting 
                   task. We find that term selection and our modified LSI 
                   representations lead to similar topic spotting performance, and 
                   that this performance is equal to or better than other published 
                   results on the same corpus.},
}
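%
% A minimal sketch, not taken from the paper above, of the pipeline it
% studies: LSI-style dimensionality reduction followed by a small neural
% network. Assumes scikit-learn; the number of latent dimensions, the
% network size and the toy data are invented.
%
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline

docs = ["grain shipment delayed", "wheat crop report", "rate cut expected", "bank lifts lending rate"]
labels = ["grain", "grain", "interest", "interest"]

model = make_pipeline(
    TfidfVectorizer(),
    TruncatedSVD(n_components=2),                         # latent semantic dimensions
    MLPClassifier(hidden_layer_sizes=(16,), max_iter=2000),
)
model.fit(docs, labels)
print(model.predict(["crop shipment report"]))
%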
@article{Wong96,
   author       = {Jacqueline W. Wong and Wing-Kay Kan and Gilbert H. Young},
   title        = {{{\sc Action}}: automatic classification for full-text documents},
   journal      = {{SIGIR} Forum},
   year         = {1996},
   volume       = {30},
   number       = {1},
   pages        = {26--41},
   url          = {},
   abstract     = {},
}
@inProceedings{Yamazaki97,
   author       = {Takefumi Yamazaki and Ido Dagan},
   title        = {Mistake-driven Learning with Thesaurus for Text Categorization},
   booktitle    = {Proceedings of NLPRS-97, the Natural Language Processing Pacific 
                   Rim Symposium},
   editor       = {},
   publisher    = {},
   address      = {Phuket, {TH}},
   pages        = {369--374},
   year         = {1997},
   url          = {ftp://www.links.nectec.or.th/pub/NLPRS/paper/dana4r.ps.gz},
   abstract     = {This paper extends the mistake-driven learner WINNOW to better 
                   utilize thesauri for text categorization. In our method not only 
                   words but also semantic categories given by the thesaurus are 
                   used as features in a classifier. New filtering and 
                   disambiguation methods are used as pre-processing to solve the 
                   problems caused by the use of the thesaurus. In order to verify 
                   our methods, we test a large body of tagged Japanese newspaper 
                   articles created by RWCP. Experimental results show that WINNOW 
                   with thesauri attains high accuracy and that the proposed 
                   filtering and disambiguation methods also contribute to the 
                   improved accuracy.},
}
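%
% A minimal sketch, not taken from the paper above, of the plain Winnow
% mistake-driven learner over binary word features; the paper additionally
% adds thesaurus categories as features and applies filtering and
% disambiguation, all omitted here. The toy feature indices are invented.
%
class Winnow:
    """Plain (positive) Winnow: multiplicative promote/demote on mistakes."""

    def __init__(self, n_features, threshold=None, alpha=2.0):
        self.w = [1.0] * n_features
        self.theta = threshold if threshold is not None else n_features / 2.0
        self.alpha = alpha

    def predict(self, active):                 # active: indices of features present
        return sum(self.w[i] for i in active) >= self.theta

    def update(self, active, label):           # label: True/False gold category
        predicted = self.predict(active)
        if predicted and not label:            # false positive: demote active weights
            for i in active:
                self.w[i] /= self.alpha
        elif label and not predicted:          # false negative: promote active weights
            for i in active:
                self.w[i] *= self.alpha

# Toy run: feature 0 = "dividend", 1 = "goal", 2 = "profit" (invented indices).
clf = Winnow(n_features=3)
for active, label in [([0, 2], True), ([1], False), ([0], True), ([1, 2], False)] * 5:
    clf.update(active, label)
print(clf.predict([0, 2]), clf.predict([1]))
%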
@inProceedings{Yang00b,
   author       = {Hsin-Chang Yang and Chung-Hong Lee},
   title        = {Automatic category generation for text documents by 
                   self-organizing maps},
   booktitle    = {Proceedings of IJCNN-00, 11th International Joint Conference on 
                   Neural Networks},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   editor       = {Amari, Shun-Ichi and Giles, C. Lee and Gori, Marco and Piuri, 
                   Vincenzo},
   year         = {2000},
   address      = {Como, {IT}},
   volume       = {3},
   pages        = {581--586},
   url          = {http://dlib.computer.org/conferen/ijcnn/0619/pdf/06193581.pdf},
   abstract     = {One important task for text data mining is automatic text 
                   categorization, which assigns a text document to some predefined 
                   category according to their correlations. Traditionally, these 
                   categories as well as the correlations among them are determined 
                    by human experts. In this paper, we devised a novel approach to 
                   automatically generate categories. The self-organizing map model 
                   is used to generate two maps, namely the word cluster map and the 
                   document cluster map, in which a neuron represents a cluster of 
                    words or documents, respectively. Our approach is to analyze the 
                   document cluster map to find centroids of some super-clusters. We 
                   also devised a method to select the category term from the word 
                   cluster map. The hierarchical structure of categories may be 
                   generated by recursively applying the same method. Text 
                    categorization is the natural consequence of such an automatic 
                   category generation process.},
}
@inProceedings{Yang00c,
   author       = {Hsin-Chang Yang and Chung-Hong Lee},
   title        = {Automatic category structure generation and categorization of 
                   {C}hinese text documents},
   booktitle    = {Proceedings of PKDD-00, 4th European Conference on Principles of 
                   Data Mining and Knowledge Discovery},
   editor       = {Djamel A. Zighed and Jan Komorowski and Jan Zytkow},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1910},
   year         = {2000},
   address      = {Lyon, {FR}},
   pages        = {673--678},
   url          = {http://link.springer.de/link/service/series/0558/papers/1910/19100673.pdf},
   abstract     = {Recently knowledge discovery and data mining in unstructured or 
                   semi-structured texts (text mining) has attracted lots of 
                   attention from both commercial and research fields. One aspect of 
                   text mining is automatic text categorization, which assigns a 
                   text document to some predefined category according to the 
                   correlation between the document and the category. Traditionally, 
                    the categories are arranged in a hierarchical manner to achieve 
                    effective searching and indexing, as well as easy comprehension 
                    for humans. The determination of categories and their 
                    hierarchical structures was mostly done by human experts. The 
                   authors developed an approach to automatically generate 
                   categories and reveal the hierarchical structure among them. We 
                   also used the generated structure to categorize text documents. 
                   The document collection is trained by a self-organizing map to 
                   form two feature maps. We then analyzed the two maps to obtain 
                   the categories and the structure among them. Although the corpus 
                   contains documents written in Chinese, the proposed approach can 
                    be applied to documents written in any language, provided such 
                   documents can be transformed into a list of separated terms.},
}
@inProceedings{Yang93,
   author       = {Yiming Yang and Christopher G. Chute},
   title        = {An application of {Least Squares Fit} mapping to text information 
                   retrieval},
   booktitle    = {Proceedings of SIGIR-93, 16th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Robert Korfhage and Edie Rasmussen and Peter Willett},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Pittsburgh, {US}},
   pages        = {281--290},
   year         = {1993},
   note         = {An extended version appears as~\cite{Yang94}},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/160688/p281-yang/p281-yang.pdf},
   abstract     = {This paper describes a unique example-based mapping method for 
                   document retrieval. We discovered that the knowledge about 
                   relevance among queries and documents can be used to obtain 
                   empirical connections between query terms and the canonical 
                   concepts which are used for indexing the content of documents. 
                   These connections do not depend on whether there are shared terms 
                   among the queries and documents; therefore, they are especially 
                   effective for a mapping from queries to the documents where the 
                   concepts are relevant but the terms used by article authors 
                   happen to be different from the terms of database users. We 
                   employ a Linear Least Squares Fit (LLSF) technique to compute 
                   such connections from a collection of queries and documents where 
                   the relevance is assigned by humans, and then use these 
                   connections in the retrieval of documents where the relevance is 
                   unknown. We tested this method on both retrieval and indexing 
                   with a set of MEDLINE documents which has been used by other 
                   information retrieval systems for evaluations. The effectiveness 
                   of the LLSF mapping and the significant improvement over 
                   alternative approaches was evident in the tests.},
}
@inProceedings{Yang94a,
   author       = {Yiming Yang},
   title        = {Expert network: effective and efficient learning from human 
                   decisions in text categorisation and retrieval},
   booktitle    = {Proceedings of SIGIR-94, 17th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {W. Bruce Croft and Cornelis J. Van Rijsbergen},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Dublin, {IE}},
   pages        = {13--22},
   year         = {1994},
   url          = {http://www.acm.org/pubs/articles/proceedings/ir/188490/p13-yang/p13-yang.pdf},
   abstract     = {Expert Network (ExpNet) is our approach to automatic 
                   categorization and retrieval of natural language texts. We use a 
                   training set of texts with expert assigned categories to 
                   construct a network which approximately reflects the conditional 
                   probabilities of categories given a text. The input nodes of the 
                   network are words in the training texts, the nodes on the 
                   intermediate level are the training texts, and the output nodes 
                   are categories. The links between nodes are computed based on 
                   statistics of the word distribution and the category distribution 
                   over the training set. ExpNet is used for relevance ranking of 
                   candidate categories of an arbitrary text in the case of text 
                   categorization, and for relevance ranking of documents via 
                   categories in the case of text retrieval. We have evaluated 
                   ExpNet in categorization and retrieval on a document collection 
                   of the MEDLINE database, and observed a performance in recall and 
                   precision comparable to the Linear Least Squares Fit (LLSF) 
                   mapping method, and significantly better than other methods 
                   tested. Computationally, ExpNet has an O(N log N) time complexity 
                   which is much more efficient than the cubic complexity of the 
                   LLSF method. The simplicity of the model, the high recall 
                   precision rates, and the efficient computation together make 
                   ExpNet preferable as a practical solution for real world 
                   applications.},
}
@article{Yang94,
   author       = {Yiming Yang and Christopher G. Chute},
   title        = {An example-based mapping method for text categorization and 
                   retrieval},
   journal      = {{ACM} Transactions on Information Systems},
   year         = {1994},
   number       = {3},
   volume       = {12},
   pages        = {252--277},
   url          = {http://www.acm.org/pubs/articles/journals/tois/1994-12-3/p252-yang/p252-yang.pdf},
   abstract     = {A unified model for text categorization and text retrieval is 
                   introduced. We use a training set of manually categorized 
                   documents to learn word-category associations, and use these 
                   associations to predict the categories of arbitrary documents. 
                   Similarly, we use a training set of queries and their related 
                   documents to obtain empirical associations between query words 
                   and indexing terms of documents, and use these associations to 
                   predict the related documents of arbitrary queries. A linear 
                   least squares fit (LLSF) technique is employed to estimate the 
                   likelihood of these associations. Document collections from the 
                   MEDLINE database and Mayo patient records are used for studies on 
                   the effectiveness of our approach, and on how much the 
                   effectiveness depends on the choices of training data, indexing 
                   language, word-weighting scheme, and morphological 
                   canonicalization. Alternative methods are also tested on these 
                   data collections for comparison. It is evident that the LLSF 
                   approach uses the relevance information effectively within human 
                   decisions of categorization and retrieval, and achieves a 
                   semantic mapping of free texts to their representations in an 
                   indexing language. Such a semantic mapping leads to a significant 
                   improvement in categorization and retrieval, compared to 
                   alternative approaches.},
}
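%
% A minimal sketch, not taken from the article above, of the core LLSF
% step: fitting a word-to-category association matrix by linear least
% squares and ranking categories for a new document. Assumes numpy; the
% toy matrices are invented, and the weighting and truncation choices
% studied in the article are ignored.
%
import numpy as np

# Toy term-by-document data: rows = documents, columns = terms.
X = np.array([[1, 1, 0, 0],      # "bank rate"
              [1, 0, 1, 0],      # "bank loan"
              [0, 0, 0, 1]],     # "wheat"
             dtype=float)
# Rows = documents, columns = categories (1 if the category is assigned).
Y = np.array([[1, 0],            # money-fx
              [1, 0],            # money-fx
              [0, 1]],           # grain
             dtype=float)

# Least-squares fit of a term -> category association matrix W.
W, *_ = np.linalg.lstsq(X, Y, rcond=None)

new_doc = np.array([[0, 1, 1, 0]], dtype=float)   # "rate loan"
scores = new_doc.dot(W)
print(scores)          # rank categories by score
%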
@inProceedings{Yang95,
   author       = {Yiming Yang},
   title        = {Noise reduction in a statistical approach to text categorization},
   booktitle    = {Proceedings of SIGIR-95, 18th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Edward A. Fox and Peter Ingwersen and Raya Fidel},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {1995},
   address      = {Seattle, {US}},
   pages        = {256--263},
   url          = {http://www.cs.cmu.edu/~yiming/papers.yy/sigir95.ps},
   abstract     = {The paper studies noise reduction for computational efficiency 
                   improvements in a statistical learning method for text 
                   categorization, the linear least squares fit (LLSF) mapping. 
                   Multiple noise reduction strategies are proposed and evaluated, 
                   including: an aggressive removal of ``noninformative words'' from 
                   texts before training; the use of a truncated singular value 
                   decomposition to cut off noisy ``latent semantic structures'' 
                   during training; the elimination of noninfluential components in 
                   the LLSF solution (a word concept association matrix) after 
                   training. Text collections in different domains were used for 
                   evaluation. Significant improvements in computational efficiency 
                   without losing categorization accuracy were evident in the 
                   testing results.},
}
@inProceedings{Yang96a,
   author       = {Yiming Yang},
   title        = {An evaluation of statistical approaches to {MEDLINE} indexing},
   booktitle    = {Proceedings of AMIA-96, Fall Symposium of the American Medical 
                   Informatics Association},
   editor       = {James J. Cimino},
   publisher    = {Hanley and Belfus},
   year         = {1996},
   address      = {Washington, {US}},
   pages        = {358--362},
   url          = {http://www.cs.cmu.edu/afs/cs/user/yiming/www/courses/bibliography/papers/scamc96.ps},
   abstract     = {Whether or not high accuracy classification methods can be scaled 
                   to large applications is crucial for the ultimate usefulness of 
                   such methods in text categorization. This paper applies two 
                   statistical learning algorithms, the Linear Least Squares Fit 
                   (LLSF) mapping and a Nearest Neighbor classifier named ExpNet, to 
                   a large collection of MEDLINE documents. With the use of suitable 
                   dimensionality reduction techniques and efficient algorithms, 
                   both LLSF and ExpNet successfully scaled to this very large 
                   problem with a result significantly outperforming word-matching 
                   and other automatic learning methods applied to the same corpus.},
}
@article{Yang96,
   author       = {Yiming Yang and John W. Wilbur},
   title        = {Using corpus statistics to remove redundant words in text 
                   categorization},
   journal      = {Journal of the American Society for Information Science},
   year         = {1996},
   volume       = {47},
   number       = {5},
   pages        = {357--369},
   url          = {http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=57757&PLACEBO=IE.pdf},
   abstract     = {This article studies aggressive word removal in text 
                   categorization to reduce the noise in free texts and to enhance 
                   the computational efficiency of categorization. We use a novel 
                   stop word identification method to automatically generate 
                   domain-specific stoplists which are much larger than a 
                   conventional domain-independent stoplist. In our tests with three 
                   categorization methods on text collections from different 
                   domains/applications, significant numbers of words were removed 
                   without sacrificing categorization effectiveness. In the test of 
                   the Expert Network method on CACM documents, for example, an 87\% 
                   removal of unique words reduced the vocabulary of documents from 
                   8,002 distinct words to 1,045 words, which resulted in a 63\% 
                   time saving and a 74\% memory saving in the computation of 
                   category ranking, with a 10\% precision improvement, on average, 
                   over not using word removal. It is evident in this study that 
                   automated word removal based on corpus statistics has a practical 
                   and significant impact on the computational tractability of 
                   categorization methods in large databases.},
}
@article{Yang96b,
   author       = {Yiming Yang and John W. Wilbur},
   title        = {An analysis of statistical term strength and its use in the 
                   indexing and retrieval of molecular biology texts},
   journal      = {Computers in Biology and Medicine},
   year         = {1996},
   volume       = {26},
   number       = {3},
   pages        = {209--222},
   url          = {},
   abstract     = {The biological literature presents a difficult challenge to 
                   information processing in its complexity, diversity, and in its 
                   sheer volume. Much of the diversity resides in its technical 
                   terminology, which has also become voluminous. In an effort to 
                   deal more effectively with this large vocabulary and improve 
                   information processing, a method of focus has been developed 
                   which allows one to classify terms based on a measure of their 
                   importance in describing the content of the documents in which 
                   they occur. The measurement is called the strength of a term and 
                    is a measure of how strongly the term's occurrences correlate 
                   with the subjects of documents in the database. If term 
                   occurrences are random then there will be no correlation and the 
                   strength will be zero, but if for any subject, the term is either 
                   always present or never present its strength will be one. We give 
                   here a new, information theoretical interpretation of term 
                   strength, review some of its uses in focusing the processing of 
                   documents for information retrieval and describe new results 
                   obtained in document categorization.},
}
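%
% A minimal sketch, not taken from the article above, of estimating term
% strength as s(t) = P(t occurs in the second document | t occurs in the
% first) over pairs of related documents; the related pairs below are an
% invented toy example.
%
from collections import Counter

related_pairs = [
    ({"wheat", "crop", "export"}, {"wheat", "harvest"}),
    ({"wheat", "harvest"}, {"crop", "export", "wheat"}),
    ({"bank", "rate"}, {"rate", "cut"}),
]

appears = Counter()      # times a term appears in the first document of a pair
carries = Counter()      # times it also appears in the related second document

for x, y in related_pairs:
    for t in x:
        appears[t] += 1
        if t in y:
            carries[t] += 1

strength = {t: carries[t] / appears[t] for t in appears}
print(sorted(strength.items(), key=lambda kv: -kv[1]))
%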
@inProceedings{Yang97,
   author       = {Yiming Yang and Jan O. Pedersen},
   title        = {A comparative study on feature selection in text categorization},
   booktitle    = {Proceedings of ICML-97, 14th International Conference on Machine 
                   Learning},
   editor       = {Douglas H. Fisher},
   year         = {1997},
   address      = {Nashville, {US}},
   pages        = {412--420},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www.cs.cmu.edu/~yiming/papers.yy/ml97.ps},
   abstract     = {This paper is a comparative study of feature selection methods in 
                   statistical learning of text categorization. The focus is on 
                   aggressive dimensionality reduction. Five methods were evaluated, 
                   including term selection based on document frequency (DF), 
                    information gain (IG), mutual information (MI), a $\chi^2$ test (CHI), 
                   and term strength (TS). We found IG and CHI most effective in our 
                   experiments. Using IG thresholding with a k-nearest neighbor 
                    classifier on the Reuters corpus, removal of up to 98\% 
                   of unique terms actually yielded an improved classification 
                   accuracy (measured by average precision). DF thresholding 
                   performed similarly. Indeed we found strong correlations between 
                   the DF, IG and CHI values of a term. This suggests that DF 
                   thresholding, the simplest method with the lowest cost in 
                   computation, can be reliably used instead of IG or CHI when the 
                    computation of these measures is too expensive. TS compares 
                   favorably with the other methods with up to 50\% vocabulary 
                   reduction but is not competitive at higher vocabulary reduction 
                   levels. In contrast, MI had relatively poor performance due to 
                   its bias towards favoring rare terms, and its sensitivity to 
                   probability estimation errors.},
}
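%
% A minimal sketch, not taken from the paper above, of two of the criteria
% it compares: document-frequency thresholding and chi-square term
% selection. Assumes a recent scikit-learn; the toy documents and the
% values of min_df and k are invented.
%
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2

docs = ["wheat crop up", "wheat exports fall", "rate cut soon", "bank cut rate"]
labels = ["grain", "grain", "money-fx", "money-fx"]

# DF thresholding: drop terms occurring in fewer than two documents.
vec = CountVectorizer(min_df=2)
X = vec.fit_transform(docs)

# Chi-square selection: keep the k terms most associated with the labels.
selector = SelectKBest(chi2, k=2).fit(X, labels)
kept = [t for t, keep in zip(vec.get_feature_names_out(), selector.get_support()) if keep]
print(kept)
%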
@article{Yang99a,
   author       = {Yiming Yang},
   title        = {An evaluation of statistical approaches to text categorization},
   journal      = {Information Retrieval},
   year         = {1999},
   pages        = {69--90},
   volume       = {1},
   number       = {1/2},
   url          = {http://www.cs.cmu.edu/~yiming/papers.yy/irj99.ps},
   abstract     = {This paper focuses on a comparative evaluation of a wide-range of 
                   text categorization methods, including previously published 
                   results on the Reuters corpus and new results of additional 
                   experiments. A controlled study using three classifiers, kNN, 
                   LLSF and WORD, was conducted to examine the impact of 
                   configuration variations in five versions of Reuters on the 
                   observed performance of classifiers. Analysis and empirical 
                   evidence suggest that the evaluation results on some versions of 
                   Reuters were significantly affected by the inclusion of a large 
                   portion of unlabelled documents, making those results difficult 
                   to interpret and leading to considerable confusions in the 
                   literature. Using the results evaluated on the other versions of 
                   Reuters which exclude the unlabelled documents, the performance 
                   of twelve methods are compared directly or indirectly. For 
                   indirect comparisons, kNN, LLSF and WORD were used as baselines, 
                   since they were evaluated on all versions of Reuters that exclude 
                   the unlabelled documents. As a global observation, kNN, LLSF and 
                   a neural network method had the best performance; except for a 
                   naive Bayes approach, the other learning algorithms also 
                   performed relatively well.},
}
@inProceedings{Yang99,
   author       = {Yiming Yang and Xin Liu},
   title        = {A re-examination of text categorization methods},
   booktitle    = {Proceedings of SIGIR-99, 22nd ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Marti A. Hearst and Fredric Gey and Richard Tong},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Berkeley, {US}},
   year         = {1999},
   pages        = {42--49},
   url          = {http://www.cs.cmu.edu/~yiming/papers.yy/sigir99.ps},
   abstract     = {This paper reports a controlled study with statistical 
                   significance tests on five text categorization methods: the 
                   Support Vector Machines (SVM), a k-Nearest Neighbor (kNN) 
                   classifier, a neural network (NNet) approach, the Linear 
                   Least-squares Fit (LLSF) mapping and a Naive Bayes (NB) 
                   classifier. We focus on the robustness of these methods in 
                   dealing with a skewed category distribution, and their 
                   performance as function of the training-set category frequency. 
                   Our results show that SVM, kNN and LLSF significantly outperform 
                   NNet and NB when the number of positive training instances per 
                    category is small (less than ten), and that all the methods 
                   perform comparably when the categories are sufficiently common 
                   (over 300 instances).},
}
@inProceedings{Yang00a,
   author       = {Yiming Yang and Thomas Ault and Thomas Pierce},
   title        = {Combining multiple learning strategies for effective 
                   cross-validation},
   booktitle    = {Proceedings of ICML-00, 17th International Conference on Machine 
                   Learning},
   editor       = {Pat Langley},
   year         = {2000},
   address      = {Stanford, {US}},
   pages        = {1167--1182},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {http://www.cs.cmu.edu/~yiming/papers.yy/icml00.ps.gz},
   abstract     = {Parameter tuning through cross-validation becomes very difficult 
                   when the validation set contains no or only a few examples of the 
                   classes in the evaluation set. We address this open challenge by 
                   using a combination of classifiers with different performance 
                   characteristics to effectively reduce the performance variance on 
                   average of the overall system across all classes, including those 
                   not seen before. This approach allows us to tune the combination 
                   system on available but less-representative validation data and 
                   obtain smaller performance degradation of this system on the 
                   evaluation data than using a single-method classifier alone. We 
                   tested this approach by applying k-Nearest Neighbor, Rocchio and 
                   Language Modeling classifiers and their combination to the event 
                   tracking problem in the Topic Detection and Tracking (TDT) 
                   domain, where new classes (events) are created constantly over 
                   time, and representative validation sets for new classes are 
                   often difficult to obtain on time. When parameters tuned on an 
                   early benchmark TDT corpus were evaluated on a later TDT 
                   benchmark corpus with no overlapping events, we observed a 
                   38-65\% reduction in tracking cost (a weighted combination of 
                   errors) by the combined system over the individual methods 
                   evaluated under the same conditions, strongly suggesting the 
                   robustness of this approach as a solution for improving 
                   cross-class performance consistency of statistical classifiers 
                   when standard cross-validation fails due to the lack of 
                   representative validation sets.},
}
@inProceedings{Yang00,
   author       = {Yiming Yang and Thomas Ault and Thomas Pierce and Charles W. 
                   Lattimer},
   title        = {Improving text categorization methods for event tracking},
   booktitle    = {Proceedings of SIGIR-00, 23rd ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Nicholas J. Belkin and Peter Ingwersen and Mun-Kew Leong},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Athens, {GR}},
   year         = {2000},
   pages        = {65--72},
   url          = {http://www.cs.cmu.edu/~yiming/papers.yy/sigir00.ps},
   abstract     = {Automated tracking of events from chronologically ordered 
                   document streams is a new challenge for statistical text 
                   classification. Existing learning techniques must be adapted or 
                   improved in order to effectively handle difficult situations 
                   where the number of positive training instances per event is 
                   extremely small, the majority of training documents are 
                   unlabelled, and most of the events have a short duration in time. 
                   We adapted several supervised text categorization methods, 
                   specifically several new variants of the k-Nearest Neighbor (kNN) 
                   algorithm and a Rocchio approach, to track events. All of these 
                   methods showed significant improvement (up to 71\% reduction in 
                   weighted error rates) over the performance of the original kNN 
                   algorithm on TDT benchmark collections, making kNN among the 
                   top-performing systems in the recent TDT3 official evaluation. 
                   Furthermore, by combining these methods, we significantly reduced 
                   the variance in performance of our event tracking system over 
                   different data collections, suggesting a robust solution for 
                   parameter optimization.},
}
@inProceedings{Yang01,
   author       = {Yiming Yang},
   title        = {A Study on Thresholding Strategies for Text Categorization},
   booktitle    = {Proceedings of SIGIR-01, 24th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {W. Bruce Croft and David J. Harper and Donald H. Kraft and Justin 
                   Zobel},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {New Orleans, {US}},
   year         = {2001},
   pages        = {137--145},
   url          = {http://www.cs.cmu.edu/~yiming/papers.yy/sigir01.ps.gz},
   abstract     = {Thresholding strategies in automated text categorization are an 
                   underexplored area of research. This paper presents an 
                   examination of the effect of thresholding strategies on the 
                   performance of a classifier under various conditions. Using 
                   k-Nearest Neighbor (kNN) as the classifier and five evaluation 
                    benchmark collections as the testbeds, three common thresholding 
                   methods were investigated, including rank-based thresholding 
                   (RCut), proportion-based assignments (PCut) and score-based local 
                   optimization (SCut); in addition, new variants of these methods 
                   are proposed to overcome significant problems in the existing 
                   approaches. Experimental results show that the choice of 
                   thresholding strategy can significantly influence the performance 
                    of kNN, and that the ``optimal'' strategy may vary by application. 
                   SCut is potentially better for fine-tuning but risks overfitting. 
                   PCut copes better with rare categories and exhibits a smoother 
                   trade-off in recall versus precision, but is not suitable for 
                   online decision making. RCut is most natural for online response 
                   but is too coarse-grained for global or local optimization. 
                   RTCut, a new method combining the strength of category ranking 
                   and scoring, outperforms both PCut and RCut significantly.},
}
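%
% An assumed Python sketch (not the paper's code) of two of the
% thresholding strategies discussed above: rank-based RCut assigns each
% document its t top-ranked categories, while proportion-based PCut
% assigns each category to its k_c highest-scoring documents, with k_c
% proportional to the category's estimated prior.
%
import numpy as np

def rcut(scores, t=1):
    """RCut: assign to each document its t top-ranked categories."""
    decisions = np.zeros_like(scores, dtype=bool)
    top = np.argsort(-scores, axis=1)[:, :t]
    for doc, cats in enumerate(top):
        decisions[doc, cats] = True
    return decisions

def pcut(scores, category_priors, x=1.0):
    """PCut: assign each category c to its k_c top-scoring documents,
    with k_c proportional to the category's estimated prior."""
    n_docs, n_cats = scores.shape
    decisions = np.zeros_like(scores, dtype=bool)
    for c in range(n_cats):
        k_c = max(1, int(round(x * category_priors[c] * n_docs)))
        top_docs = np.argsort(-scores[:, c])[:k_c]
        decisions[top_docs, c] = True
    return decisions

# toy usage: three documents scored against three categories
scores = np.array([[0.9, 0.1, 0.3], [0.2, 0.8, 0.4], [0.6, 0.5, 0.7]])
print(rcut(scores, t=1))
print(pcut(scores, category_priors=[0.5, 0.3, 0.2]))
%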
@article{Yang02,
   author       = {Yiming Yang and Se{\'{a}}n Slattery and Rayid Ghani},
   title        = {A Study of Approaches to Hypertext Categorization},
   journal      = {Journal of Intelligent Information Systems},
   year         = {2002},
   note         = {Special Issue on Automated Text Categorization},
   volume       = {18},
   number       = {2/3},
   pages        = {219--241},
   url          = {http://www.wkap.nl/article.pdf?391248},
   abstract     = {Hypertext poses new research challenges for text classification. 
                   Hyperlinks, HTML tags, category labels distributed over linked 
                   documents, and meta data extracted from related Web sites all 
                   provide rich information for classifying hypertext documents. How 
                   to appropriately represent that information and automatically 
                   learn statistical patterns for solving hypertext classification 
                   problems is an open question. This paper seeks a principled 
                   approach to providing the answers. Specifically, we define five 
                   {\em hypertext regularities} which may (or may not) hold in a 
                   particular application domain, and whose presence (or absence) 
                   may significantly influence the optimal design of a classifier. 
                   Using three hypertext datasets and three well-known learning 
                   algorithms (Naive Bayes, Nearest Neighbor, and First Order 
                   Inductive Learner), we examine these regularities in different 
                   domains, and compare alternative ways to exploit them. Our 
                   results show that the identification of hypertext regularities in 
                   the data and the selection of appropriate representations for 
                   hypertext in particular domains are crucial, but seldom obvious, 
                   in real-world problems. We find that adding the words in the 
                   linked neighborhood to the page having those links (both inlinks 
                    and outlinks) was helpful for all our classifiers on one data 
                   set, but more harmful than helpful for two out of the three 
                   classifiers on the remaining datasets. We also observed that 
                   extracting meta data from related Web sites was extremely useful 
                   for improving classification accuracy in some of those domains. 
                   Finally, the relative performance of the classifiers being tested 
                   provided insights into their strengths and limitations for 
                   solving classification problems involving diverse and often noisy 
                   Web pages.},
}
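%
% A hypothetical sketch of one representation choice examined in the
% paper above: appending the words of a page's linked neighbourhood
% (both inlinks and outlinks) to the page's own words before
% classification. The data structures are assumptions for illustration.
%
def augment_with_neighbours(pages, links):
    """pages: {url: list of tokens}; links: {url: set of outlinked urls}.
    Returns, per page, the page's own tokens plus its neighbours' tokens."""
    inlinks = {url: set() for url in pages}           # invert the outlink map
    for src, targets in links.items():
        for tgt in targets:
            if tgt in inlinks:
                inlinks[tgt].add(src)
    augmented = {}
    for url, tokens in pages.items():
        neighbours = links.get(url, set()) | inlinks[url]
        extra = [tok for n in neighbours for tok in pages.get(n, [])]
        augmented[url] = tokens + extra
    return augmented

# toy usage: page "a" links to "b", page "c" links to "a"
pages = {"a": ["svm", "kernel"], "b": ["hypertext", "link"], "c": ["bayes"]}
links = {"a": {"b"}, "c": {"a"}}
print(augment_with_neighbours(pages, links)["a"])
%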
@inProceedings{Yang03,
   author       = {Yiming Yang and Jian Zhang and Bryan Kisiel},
   title        = {A scalability analysis of classifiers in text categorization},
   booktitle    = {Proceedings of SIGIR-03, 26th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Jamie Callan and Gordon Cormack and Charles Clarke and David 
                   Hawking and Alan Smeaton},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Toronto, {CA}},
   year         = {2003},
   pages        = {96--103},
   url          = {http://doi.acm.org/10.1145/860435.860455},
   abstract     = {Real-world applications of text categorization often require a 
                   system to deal with tens of thousands of categories defined over 
                   a large taxonomy. This paper addresses the problem with respect 
                   to a set of popular algorithms in text categorization, including 
                   Support Vector Machines, k-nearest neighbor, ridge regression, 
                   linear least square fit and logistic regression. By providing a 
                   formal analysis of the computational complexity of each 
                   classification method, followed by an investigation on the usage 
                   of different classifiers in a hierarchical setting of 
                   categorization, we show how the scalability of a method depends 
                   on the topology of the hierarchy and the category distributions. 
                   In addition, we are able to obtain tight bounds for the 
                   complexities by using the power law to approximate category 
                   distributions over a hierarchy. Experiments with kNN and SVM 
                   classifiers on the OHSUMED corpus are reported on, as concrete 
                   examples.},
}
@inProceedings{Yavuz98,
   author       = {Yavuz, Tuba and G{\"u}venir, H. Altay},
   title        = {Application of k-nearest neighbor on feature projections 
                   classifier to text categorization},
   booktitle    = {Proceedings of ISCIS-98, 13th International Symposium on Computer 
                   and Information Sciences},
   editor       = {U. Gudukbay and T. Dayar and A. Gursoy and Erol Gelenbe},
   publisher    = {{IOS} Press, Amsterdam, {NL}},
   year         = {1998},
   address      = {Ankara, {TR}},
   pages        = {135--142},
   url          = {ftp://ftp.cs.bilkent.edu.tr/pub/tech-reports/1998/BU-CEIS-9809.ps.z},
   abstract     = {This paper presents the results of the application of an 
                   instance-based learning algorithm k-nearest neighbor method on 
                   feature projections (k-NNFP) to text categorization and compares 
                    it with the k-nearest neighbor classifier (k-NN). k-NNFP is 
                    similar to k-NN except that it finds the nearest neighbors 
                    according to each feature separately and then combines these 
                    predictions by majority voting. This property allows k-NNFP to eliminate 
                   possible adverse effects of irrelevant features on the 
                   classification accuracy. Experimental evidence indicates that 
                   k-NNFP is superior to k-NN in terms of classification accuracy in 
                   the presence of irrelevant features in many real world domains.},
}
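%
% A minimal Python sketch, assumed rather than taken from the paper, of
% the k-NNFP idea: nearest neighbours are found on each feature
% projection separately and the per-feature predictions are combined by
% majority voting (the published algorithm accumulates the per-neighbour
% votes somewhat differently).
%
from collections import Counter
import numpy as np

def knnfp_predict(X_train, y_train, x, k=3):
    """Predict the class of x by majority voting over per-feature kNN votes."""
    votes = []
    for f in range(X_train.shape[1]):
        dist = np.abs(X_train[:, f] - x[f])     # neighbours on this feature only
        nearest = np.argsort(dist)[:k]
        votes.append(Counter(y_train[nearest]).most_common(1)[0][0])
    return Counter(votes).most_common(1)[0][0]

# toy usage: two features, two classes
X_train = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
y_train = np.array([0, 0, 1, 1])
print(knnfp_predict(X_train, y_train, np.array([0.8, 0.2]), k=2))
%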
@inProceedings{Yi00,
   author       = {Jeonghee Yi and Neel Sundaresan},
   title        = {A classifier for semi-structured documents},
   booktitle    = {Proceedings of KDD-00, 6th ACM International Conference on 
                   Knowledge Discovery and Data Mining},
   editor       = {},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Boston, {US}},
   year         = {2000},
   pages        = {340--344},
   url          = {http://doi.acm.org/10.1145/347090.347164},
   abstract     = {In this paper, we describe a novel text classifier that can 
                   effectively cope with structured documents. We report experiments 
                   that compare its performance with that of a well-known 
                   probabilistic classifier. Our novel classifier can take advantage 
                    of the information in the structure of a document that 
                    conventional, purely term-based classifiers ignore. Conventional 
                    classifiers are mostly based on the vector space model of 
                    documents, which views a document simply as an n-dimensional 
                   vector of terms. To retain the information in the structure, we 
                   have developed a structured vector model, which represents a 
                   document with a structured vector, whose elements can be either 
                   terms or other structured vectors. With this extended model, we 
                   also have improved the well-known probabilistic classification 
                   method based on the Bernoulli document generation model. Our 
                    classifier based on these improvements performs significantly 
                   better on pre-classified samples from the web and the US Patent 
                   database than the usual classifiers.},
}
@inProceedings{Yu99,
   author       = {Edmund S. Yu and Elizabeth D. Liddy},
   title        = {Feature selection in text categorization using the {B}aldwin 
                    effect},
   booktitle    = {Proceedings of IJCNN-99, 10th International Joint Conference on 
                   Neural Networks},
   editor       = {},
   publisher    = {{IEEE} Computer Society Press, Los Alamitos, {US}},
   year         = {1999},
   address      = {Washington, {DC}},
   pages        = {2924--2927},
   url          = {},
   abstract     = {Text categorization is the problem of automatically assigning 
                   predefined categories to natural language texts. A major 
                   difficulty of this problem stems from the high dimensionality of 
                   its feature space. Reducing the dimensionality, or selecting a 
                   good subset of features, without sacrificing accuracy, is of 
                   great importance for neural networks to be successfully applied 
                   to the area. In this paper, we propose a neuro-genetic approach 
                   to feature selection in text categorization. Candidate feature 
                   subsets are evaluated by using three-layer feedforward neural 
                   networks. The Baldwin effect concerns the tradeoffs between 
                   learning and evolution. It is used in our research to guide and 
                   improve the GA-based evolution of the feature subsets. 
                   Experimental results show that our neuro-genetic algorithm is 
                   able to perform as well as, if not better than, the best results 
                   of neural networks to date, while using fewer input features.},
}
@inProceedings{Yu98,
   author       = {Kwok L. Yu and Wai Lam},
   title        = {A New On-Line Learning Algorithm for Adaptive Text Filtering},
   booktitle    = {Proceedings of CIKM-98, 7th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {Georges Gardarin and James C. French and Niki Pissinou and Kia 
                   Makki and Luc Bouganim},
   year         = {1998},
   address      = {Bethesda, {US}},
   pages        = {156--160},
   url          = {http://www.acm.org/pubs/articles/proceedings/cikm/288627/p156-yu/p156-yu.pdf},
   abstract     = {Much previous work on text filtering has been developed for 
                    batch filtering; such methods may not perform effectively in 
                    adaptive text filtering, a more realistic problem. We propose a new 
                   on-line learning algorithm, known as the ATF (Adaptive Text 
                   Filtering) algorithm, to tackle the adaptive filtering problem. 
                   Our approach maintains a pool of selective terms with potentially 
                   high predictive power. The documents are retrieved by considering 
                   both the predicted relevance and its value as a training 
                   observation. The experimental result on the FBIS document corpus 
                   shows that the ATF algorithm outperforms the pure EG 
                   (Exponentiated-gradient) algorithm.},
}
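%
% An assumed illustration of the exponentiated-gradient (EG) style of
% update mentioned in the abstract above, applied to the term weights of
% a linear relevance predictor; this is the generic EG rule for squared
% loss, not the authors' ATF algorithm, and the learning rate is an
% arbitrary choice.
%
import numpy as np

def eg_update(w, x, y, eta=0.1):
    """One exponentiated-gradient step: multiplicative update of the
    (non-negative, normalised) term weights."""
    y_hat = float(np.dot(w, x))
    w = w * np.exp(-2.0 * eta * (y_hat - y) * x)
    return w / w.sum()

# toy stream: two documents over a four-term vocabulary, relevance in {0, 1}
w = np.full(4, 0.25)
stream = [(np.array([1.0, 0.0, 1.0, 0.0]), 1.0),
          (np.array([0.0, 1.0, 0.0, 1.0]), 0.0)]
for x, y in stream:
    w = eg_update(w, x, y)
print(w)
%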
@inProceedings{Xu03,
   author       = {Zhao Xu and Kai Yu and Volker Tresp and Xiaowei Xu and Jizhi Wang},
   title        = {Representative sampling for text classification using support 
                   vector machines},
   booktitle    = {Proceedings of ECIR-03, 25th European Conference on Information 
                   Retrieval},
   publisher    = {Springer Verlag},
   editor       = {Fabrizio Sebastiani},
   address      = {Pisa, {IT}},
   year         = {2003},
   pages        = {393--407},
   url          = {http://link.springer.de/link/service/series/0558/papers/2633/26330393.pdf},
   abstract     = {In order to reduce human efforts, there has been increasing 
                   interest in applying active learning for training text 
                   classifiers. This paper describes a straightforward active 
                   learning heuristic, representative sampling, which explores the 
                   clustering structure of 'uncertain' documents and identifies the 
                   representative samples to query the user opinions, for the 
                   purpose of speeding up the convergence of Support Vector Machine 
                   (SVM) classifiers. Compared with other active learning 
                   algorithms, the proposed representative sampling explicitly 
                   addresses the problem of selecting more than one unlabeled 
                    document. In an empirical study we compared representative 
                   sampling both with random sampling and with SVM active learning. 
                   The results demonstrated that representative sampling offers 
                   excellent learning performance with fewer labeled documents and 
                   thus can reduce human efforts in text classification tasks.},
}
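%
% A rough sketch of the representative-sampling heuristic described
% above, written with scikit-learn (an assumption; the paper does not
% mention any library) and with all parameter values chosen arbitrarily:
% the unlabelled documents the current SVM is least certain about are
% clustered, and the document closest to each cluster centre is queried.
%
import numpy as np
from sklearn.svm import SVC
from sklearn.cluster import KMeans

def representative_queries(X_lab, y_lab, X_unlab, n_queries=2, pool=10):
    svm = SVC(kernel="linear").fit(X_lab, y_lab)
    margin = np.abs(svm.decision_function(X_unlab))   # distance to the hyperplane
    uncertain = np.argsort(margin)[:pool]             # pool of most uncertain documents
    km = KMeans(n_clusters=n_queries, n_init=10).fit(X_unlab[uncertain])
    queries = []
    for c in range(n_queries):                        # query the medoid of each cluster
        members = uncertain[km.labels_ == c]
        dist = np.linalg.norm(X_unlab[members] - km.cluster_centers_[c], axis=1)
        queries.append(int(members[np.argmin(dist)]))
    return queries

# toy usage with random vectors standing in for document features
rng = np.random.default_rng(0)
X_lab = rng.normal(size=(10, 5))
y_lab = (X_lab[:, 0] > 0).astype(int)
X_unlab = rng.normal(size=(50, 5))
print(representative_queries(X_lab, y_lab, X_unlab))  # indices into X_unlab
%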
@inProceedings{Xue03,
   author       = {Dejun Xue and Maosong Sun},
   title        = {Chinese text categorization based on the binary weighting model 
                   with non-binary smoothing},
   booktitle    = {Proceedings of ECIR-03, 25th European Conference on Information 
                   Retrieval},
   publisher    = {Springer Verlag},
   editor       = {Fabrizio Sebastiani},
   address      = {Pisa, {IT}},
   year         = {2003},
   pages        = {408--419},
   url          = {http://link.springer.de/link/service/series/0558/papers/2633/26330408.pdf},
   abstract     = {In Text Categorization (TC) based on the vector space model, 
                   feature weighting is vital for the categorization effectiveness. 
                   Various non-binary weighting schemes are widely used for this 
                   purpose. By emphasizing the category discrimination capability of 
                   features, the paper firstly puts forward a new weighting scheme 
                    TF*IDF*IG. Given that refined statistics are more likely to 
                    suffer from the sparse-data problem, we re-evaluate the role of 
                    the Binary Weighting Model (BWM) in TC. 
                   As a consequence, a novel approach named the Binary Weighting 
                   Model with Non-Binary Smoothing (BWM-NBS) is then proposed so as 
                   to overcome the drawback of BWM. A TC system for Chinese texts 
                   using words as features is implemented. Experiments on a 
                   large-scale Chinese document collection with 71,674 texts show 
                   that the F1 metric of categorization performance of BWM-NBS gets 
                   to 94.9\% in the best case, which is 26.4\% higher than that of 
                   TF*IDF, 19.1\% higher than that of TF*IDF*IG, and 5.8\% higher 
                   than that of BWM under the same condition. Moreover, BWM-NBS 
                   exhibits the strong stability in categorization performance.},
}
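%
% An assumed Python sketch of the TF*IDF*IG weighting idea put forward
% above: a term's tf*idf weight is multiplied by its information gain
% with respect to the category labels. The exact formulation used in the
% paper may differ; the toy corpus is invented for illustration.
%
import math
from collections import Counter

def entropy(labels):
    n = len(labels)
    return -sum((c / n) * math.log(c / n, 2) for c in Counter(labels).values())

def information_gain(term, docs, labels):
    """Information gain of the presence/absence of term w.r.t. the labels."""
    with_t = [l for d, l in zip(docs, labels) if term in d]
    without_t = [l for d, l in zip(docs, labels) if term not in d]
    gain = entropy(labels)
    for part in (with_t, without_t):
        if part:
            gain -= (len(part) / len(labels)) * entropy(part)
    return gain

def tfidf_ig(term, doc, docs, labels):
    tf = doc.count(term)
    df = sum(1 for d in docs if term in d)
    idf = math.log(len(docs) / df, 2) if df else 0.0
    return tf * idf * information_gain(term, docs, labels)

# toy usage: four tokenized documents and their category labels
docs = [["china", "trade"], ["china", "sport"], ["trade", "bank"], ["bank", "loan"]]
labels = ["econ", "sport", "econ", "econ"]
print(tfidf_ig("trade", docs[0], docs, labels))
%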
@inProceedings{Zaiane02,
   author       = {Osmar R. Za{\"{\i}}ane and Maria-Luiza Antonie},
   title        = {Classifying text documents by associating terms with text 
                   categories},
   booktitle    = {Proceedings of the 13th Australasian Conference on Database 
                   Technologies},
   publisher    = {{ACM} Press, New York, {US}},
   year         = {2002},
   pages        = {215--222},
   address      = {Melbourne, {AU}},
   volume       = {5},
   url          = {},
   note         = {This paper has also been published in \emph{Australian Computer 
                   Science Communications}, 24(2), 2002.},
   abstract     = {Automatic text categorization has always been an important 
                   application and research topic since the inception of digital 
                   documents. Today, text categorization is a necessity due to the 
                    very large number of text documents that we have to deal with 
                   daily. Many techniques and algorithms for automatic text 
                   categorization have been devised and proposed in the literature. 
                   However, there is still much room for improving the effectiveness 
                   of these classifiers, and new models need to be examined. We 
                   propose herein a new approach for automatic text categorization. 
                   This paper explores the use of association rule mining in 
                   building a text categorization system and proposes a new fast 
                   algorithm for building a text classifier. Our approach has the 
                    advantage of a very fast training phase, and the rules of the 
                    generated classifier are easy to understand and manually 
                    tuneable. Our investigation leads us to conclude that association 
                   rule mining is a good and promising strategy for efficient 
                   automatic text categorization.},
}
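%
% A toy, assumed sketch of the association-rule view of text
% categorization described above: term -> category rules whose support
% and confidence exceed given thresholds are mined from the training
% set, and new documents are classified by the rules they fire. The
% thresholds and helper names are illustrative only.
%
from collections import Counter

def mine_rules(docs, labels, min_support=0.3, min_confidence=0.7):
    n = len(docs)
    term_count, term_cat_count = Counter(), Counter()
    for tokens, cat in zip(docs, labels):
        for t in set(tokens):
            term_count[t] += 1
            term_cat_count[(t, cat)] += 1
    rules = {}
    for (t, cat), tc in term_cat_count.items():
        support, confidence = tc / n, tc / term_count[t]
        if support >= min_support and confidence >= min_confidence:
            rules[t] = (cat, confidence)
    return rules

def classify(tokens, rules):
    votes = Counter()
    for t in set(tokens):
        if t in rules:
            cat, conf = rules[t]
            votes[cat] += conf
    return votes.most_common(1)[0][0] if votes else None

# toy usage
docs = [["goal", "match"], ["match", "league"], ["bank", "rates"], ["bank", "loan"]]
labels = ["sport", "sport", "econ", "econ"]
print(classify(["bank", "goal"], mine_rules(docs, labels)))
%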
@inProceedings{Zelikovitz00,
   author       = {Sarah Zelikovitz and Haym Hirsh},
   title        = {Improving Short Text Classification Using Unlabeled Background 
                   Knowledge},
   booktitle    = {Proceedings of ICML-00, 17th International Conference on Machine 
                   Learning},
   editor       = {Pat Langley},
   year         = {2000},
   address      = {Stanford, {US}},
   pages        = {1183--1190},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {ftp://ftp.cs.rutgers.edu/pub/zelikovi/bg1.ps},
   abstract     = {We describe a method for improving the classification of short 
                   text strings using a combination of labeled training data plus a 
                   secondary corpus of unlabeled but related longer documents. We 
                   show that such unlabeled background knowledge can greatly 
                   decrease error rates, particularly if the number of examples or 
                   the size of the strings in the training set is small. This is 
                   particularly useful when labeling text is a labor-intensive job 
                   and when there is a large amount of information available about a 
                   particular problem on the World Wide Web. Our approach views the 
                   task as one of information integration using WHIRL, a tool that 
                   combines database functionalities with techniques from the 
                   information retrieval literature.},
}
@inProceedings{Zelikovitz01,
   author       = {Sarah Zelikovitz and Haym Hirsh},
   title        = {Using {LSI} for Text Classification in the Presence of Background 
                   Text},
   booktitle    = {Proceedings of CIKM-01, 10th ACM International Conference on 
                   Information and Knowledge Management},
   publisher    = {{ACM} Press, New York, {US}},
   editor       = {Henrique Paques and Ling Liu and David Grossman},
   year         = {2001},
   address      = {Atlanta, {US}},
   pages        = {113--118},
   url          = {ftp://ftp.cs.rutgers.edu/pub/zelikovi/lsi01.ps},
   abstract     = {This paper presents work that uses Latent Semantic Indexing (LSI) 
                   for text classification. However, in addition to relying on 
                   labeled training data, we improve classification accuracy by also 
                    using unlabeled data and other forms of available ``background'' 
                   text in the classification process. Rather than performing LSI's 
                   singular value decomposition (SVD) process solely on the training 
                   data, we instead use an expanded term-by-document matrix that 
                   includes both the labeled data as well as any available and 
                   relevant background text. We report the performance of this 
                   approach on data sets both with and without the inclusion of the 
                   background text, and compare our work to other efforts that can 
                   incorporate unlabeled data and other background text in the 
                   classification process.},
}
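%
% A rough sketch (using scikit-learn, an assumption not made by the
% paper) of the approach described above: the SVD is computed over a
% term-by-document matrix containing both the labelled training
% documents and additional unlabelled background text, and test
% documents are then classified by cosine similarity in the resulting
% LSI space. The toy corpus is invented for illustration.
%
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

train = ["interest rates rise", "team wins the match"]
labels = ["economy", "sport"]
background = ["central bank policy and interest rates",
              "the league match ended with a late goal"]
test = ["bank raises interest"]

vec = TfidfVectorizer()
X_all = vec.fit_transform(train + background)     # expanded term-by-document matrix
lsi = TruncatedSVD(n_components=2).fit(X_all)     # SVD over training + background text

train_lsi = lsi.transform(vec.transform(train))
test_lsi = lsi.transform(vec.transform(test))
sims = train_lsi.dot(test_lsi.T) / (
    np.linalg.norm(train_lsi, axis=1, keepdims=True) * np.linalg.norm(test_lsi))
print(labels[int(np.argmax(sims))])               # label of the most similar training document
%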
@article{Zhang01,
   author       = {Tong Zhang and Frank J. Oles},
   title        = {Text Categorization Based on Regularized Linear Classification 
                   Methods},
   journal      = {Information Retrieval},
   number       = {1},
   volume       = {4},
   pages        = {5--31},
   year         = {2001},
   url          = {http://www.wkap.nl/article.pdf?335913},
   abstract     = {A number of linear classification methods such as the linear 
                   least squares fit (LLSF), logistic regression, and support vector 
                    machines (SVMs) have been applied to text categorization 
                    problems. These methods share the similarity of finding 
                    hyperplanes that approximately separate a class of document 
                   vectors from its complement. However, support vector machines are 
                   so far considered special in that they have been demonstrated to 
                   achieve the state of the art performance. It is therefore 
                   worthwhile to understand whether such good performance is unique 
                   to the SVM design, or if it can also be achieved by other linear 
                   classification methods. In this paper, we compare a number of 
                   known linear classification methods as well as some variants in 
                   the framework of regularized linear systems. We will discuss the 
                   statistical and numerical properties of these algorithms, with a 
                   focus on text categorization. We will also provide some numerical 
                   experiments to illustrate these algorithms on a number of 
                   datasets.},
}
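%
% A minimal sketch of the common form shared by the regularized linear
% methods compared above: minimise the average loss of a linear score
% plus an L2 penalty on the weights. The squared loss and plain gradient
% descent used here are simplifying assumptions (an LLSF/ridge-style
% instance); the logistic and hinge losses fit the same mould.
%
import numpy as np

def fit_regularized_linear(X, y, lam=0.1, lr=0.1, epochs=200):
    """Minimise (1/n)*||Xw - y||^2 + lam*||w||^2 by gradient descent,
    with labels y in {-1, +1}."""
    n, d = X.shape
    w = np.zeros(d)
    for _ in range(epochs):
        grad = (2.0 / n) * X.T.dot(X.dot(w) - y) + 2.0 * lam * w
        w -= lr * grad
    return w

# toy usage: four two-feature documents, two per class
X = np.array([[1.0, 0.2], [0.9, 0.1], [0.1, 1.0], [0.2, 0.8]])
y = np.array([1.0, 1.0, -1.0, -1.0])
w = fit_regularized_linear(X, y)
print(np.sign(X.dot(w)))          # predicted labels on the toy training documents
%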
@inProceedings{Zhang03,
   author       = {Dell Zhang and Wee Sun Lee},
   title        = {Question Classification using Support Vector Machines},
   booktitle    = {Proceedings of SIGIR-03, 26th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Jamie Callan and Gordon Cormack and Charles Clarke and David 
                   Hawking and Alan Smeaton},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Toronto, {CA}},
   year         = {2003},
   pages        = {26--32},
   url          = {http://doi.acm.org/10.1145/860435.860443},
   abstract     = {Question classification is very important for question answering. 
                   This paper presents our research work on automatic question 
                   classification through machine learning approaches. We have 
                   experimented with five machine learning algorithms: Nearest 
                   Neighbors (NN), Naïve Bayes (NB), Decision Tree (DT), Sparse 
                   Network of Winnows (SNoW), and Support Vector Machines (SVM) 
                    using two kinds of features: bag-of-words and bag-of-ngrams. The 
                    experimental results show that with only surface text features the 
                   SVM outperforms the other four methods for this task. Further, we 
                   propose to use a special kernel function called the tree kernel 
                   to enable the SVM to take advantage of the syntactic structures 
                   of questions. We describe how the tree kernel can be computed 
                   efficiently by dynamic programming. The performance of our 
                   approach is promising, when tested on the questions from the TREC 
                   QA track.},
}
@inProceedings{Zhang03a,
   author       = {Jian Zhang and Rong Jin and Yiming Yang and Alex Hauptmann},
   title        = {Modified Logistic Regression: An Approximation to {SVM} and Its 
                   Applications in Large-Scale Text Categorization},
   booktitle    = {Proceedings of ICML-03, 20th International Conference on Machine 
                   Learning},
   editor       = {},
   year         = {2003},
   address      = {Washington, {DC}},
   pages        = {},
   publisher    = {Morgan Kaufmann Publishers, San Francisco, {US}},
   url          = {},
   abstract     = {},
}
@inProceedings{Zhang03b,
   author       = {Jian Zhang and Yiming Yang},
   title        = {Robustness of regularized linear classification methods in text 
                   categorization},
   booktitle    = {Proceedings of SIGIR-03, 26th ACM International Conference on 
                   Research and Development in Information Retrieval},
   editor       = {Jamie Callan and Gordon Cormack and Charles Clarke and David 
                   Hawking and Alan Smeaton},
   publisher    = {{ACM} Press, New York, {US}},
   address      = {Toronto, {CA}},
   year         = {2003},
   pages        = {190--197},
   url          = {http://doi.acm.org/10.1145/860435.860471},
   abstract     = {Real-world applications often require the classification of 
                    documents under conditions of a small number of features, 
                    mis-labeled documents, and rare positive examples. This paper 
                    investigates the robustness of three regularized linear 
                    classification methods (SVM, ridge regression and logistic 
                    regression) under these conditions. We compare these methods in 
                   terms of their loss functions and score distributions, and 
                   establish the connection between their optimization problems and 
                   generalization error bounds. Several sets of controlled 
                   experiments on the Reuters-21578 corpus are conducted to 
                   investigate the robustness of these methods. Our results show 
                   that ridge regression seems to be the most promising candidate 
                   for rare class problems.},
}
@inProceedings{Zhdanova02,
   author       = {Anna V. Zhdanova and Denis V. Shishkin},
   title        = {Classification of Email Queries by Topic: Approach Based on 
                   Hierarchically Structured Subject Domain},
   booktitle    = {Proceedings of IDEAL-02, 3rd International Conference on 
                   Intelligent Data Engineering and Automated Learning},
   editor       = {Hujun Yin and Nigel Allinson and Richard Freeman and John Keane 
                   and Simon Hubbard},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   address      = {Manchester, {UK}},
   year         = {2002},
   pages        = {99--104},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2412},
   url          = {http://link.springer.de/link/service/series/0558/papers/2412/24120099.pdf},
   abstract     = {We describe a Classifier of email queries, which executes text 
                    categorization by topic. What distinguishes our Classifier is that 
                   it allows accurate categorization of short messages containing 
                   only a few words. This advantage is achieved by executing 
                   morphological and semantic analyses of an incoming text. 
                    Specifically, the Classifier provides efficient information 
                   extraction and takes the meaning of words into consideration. By 
                   using the hierarchically structured subject domain and 
                   classification rules, the Classifier's engine assigns an email 
                   query to the most relevant category or categories.},
}
@inProceedings{Zhou00,
   author       = {Shuigeng Zhou and Ye Fan and Jiangtao Hua and Fang Yu and Yunfa 
                   Hu},
   title        = {Hierarchically Classifying Chinese Web Documents without 
                   Dictionary Support and Segmentation Procedure},
   booktitle    = {Proceedings of WAIM-00, 1st International Conference on Web-Age 
                   Information Management},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   editor       = {Hongjun Lu and Aoying Zhou},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 1846},
   year         = {2000},
   address      = {Shanghai, {CN}},
   pages        = {215--226},
   url          = {http://link.springer.de/link/service/series/0558/papers/1846/18460215.pdf},
   abstract     = {This paper reports a system that hierarchically classifies 
                   Chinese web documents without dictionary support and segmentation 
                   procedure. In our classifier, Web documents are represented by 
                    N-grams (N$\leq 4$) that are easy to extract. A boosting 
                    machine learning approach is applied to classifying Chinese Web 
                    documents that share a topic hierarchy. The open and modularized 
                    system architecture makes our classifier extensible. 
                   Experimental results show that our system can effectively and 
                   efficiently classify Chinese Web documents.},
}
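%
% An assumed illustration of the dictionary-free representation used
% above: each Chinese document is mapped to the multiset of its
% character N-grams (N <= 4 in the paper), so no word segmentation is
% required before classification.
%
from collections import Counter

def char_ngrams(text, max_n=4):
    """Return counts of all character n-grams with 1 <= n <= max_n."""
    chars = [c for c in text if not c.isspace()]
    grams = Counter()
    for n in range(1, max_n + 1):
        for i in range(len(chars) - n + 1):
            grams["".join(chars[i:i + n])] += 1
    return grams

# toy usage on a short Chinese string ("Chinese text classification")
print(char_ngrams("中文文本分类", max_n=2))
%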
@inProceedings{Zhou02,
   author       = {Shuigeng Zhou and Jihong Guan},
   title        = {An Approach to Improve Text Classification Efficiency},
   booktitle    = {Proceedings of ADBIS-02, 6th East-European Conference on Advances 
                   in Databases and Information Systems},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   editor       = {Yannis Manolopoulos and Pavol N{\'a}vrat},
   year         = {2002},
   address      = {Bratislava, {SK}},
   pages        = {65--79},
   url          = {http://link.springer.de/link/service/series/0558/papers/2435/24350065.pdf},
   abstract     = {Text classification is becoming more and more important with the 
                   rapid growth of on-line information available. In this paper, we 
                    propose an approach to speed up the process of text classification 
                    based on pruning the training corpus. An effective algorithm for 
                    text corpus pruning is designed. Experiments over real-world text 
                   corpus are carried out, which validates the effectiveness and 
                   efficiency of the proposed approach. Our approach is especially 
                   suitable for applications of on-line text classification.},
}
@inProceedings{Zhou02a,
   author       = {Shuigeng Zhou and Jihong Guan},
   title        = {Chinese Documents Classification Based on {N}-Grams},
   booktitle    = {Proceedings of CICLING-02, 3rd International Conference on 
                   Computational Linguistics and Intelligent Text Processing},
   publisher    = {Springer Verlag, Heidelberg, {DE}},
   editor       = {Alexander F. Gelbukh},
   note         = {Published in the ``Lecture Notes in Computer Science'' series, 
                   number 2276},
   year         = {2002},
   address      = {Mexico City, {MX}},
   pages        = {405--414},
   url          = {http://link.springer.de/link/service/series/0558/papers/2276/22760405.pdf},
   abstract     = {Traditional Chinese document classifiers are based on keywords 
                    in the documents, which require dictionary support and efficient 
                    segmentation procedures. This paper explores techniques for 
                    utilizing N-gram information to categorize Chinese documents so 
                    that the classifier can shake off the burden of large 
                    dictionaries and complex segmentation processing, and 
                    consequently be domain- and time-independent. A Chinese document 
                    classification system following the above-described techniques is 
                   implemented with Naive Bayes, kNN and hierarchical classification 
                   methods. Experimental results show that our system can achieve 
                   satisfactory performance, which is comparable with other 
                   traditional classifiers.},
}