-- 作者:DMman
-- 发布时间:7/4/2007 5:55:00 PM
-- 在自己的算法中调用Weka实现文本分类的一个例子[DMman整理]
1 介绍:嵌入式机器学习,在自己的算法中调用Weka现文本分类,是一个小的数据挖掘程序,虽然实用价值不是很大,但对于Weka的理解和使用是有帮助的。本例子来自《数据挖掘:实用机器学习技术》第2版 (好像是倒数第三章)。大家可以到 http://www.ieee.org.cn/dispbbs.asp?boardID=69&ID=47510 下载该书察看对算法的详细解释。算法中作了详细的注释,虽然是英文的,但还是比较简单。下面对例子的使用作了浅显的介绍,有兴趣的朋友可以研究。(DMman整理) 2 功能:使用weka中的j48分类器实现了文本分类的一个小程序。文本文件通过weka的过滤器StringToWordVector预处理。 3 注意:把weka.jar加入你的classpath中,才可以通过编译。 4 使用方法: 命令行参数: -t 文本文件路径 -m 你的模型文件路径 -c 可选,类别(hit 或 miss) 如果提供了-c则用于训练,否则被模型分类,输出该文本的类型(hit或miss) 模型是动态建立的,第一次使用命令行必须指定-c参数,才可以建立模型。 1) 建立模型 >java MessageClassifier -t data/1.bmp -m myModel -c hit 可以看到myModel建立了。然后继续训练一下这个模型。使用的文本实例越多,模型的分类性能越好 >java MessageClassifier -t data/2.bmp -m myModel -c hit >java MessageClassifier -t data/1.gif -m myModel -c miss ...... 2) 使用模型分类 有了模型,就可以使用它为文本文件分类了,如 >java MessageClassifier -t data/2.gif -m myModel 可以看到模型对它的分类 3) 可以使用提供-c参数的命令继续完善模型 原文件MessageClassifier .java 可直接通过编译运行 因为命令行有参数 所以在cmd下运行比较好 /** * Java program for classifying text messages into two classes. */ import weka.core.Attribute; import weka.core.Instance; import weka.core.Instances; import weka.core.FastVector; import weka.core.Utils; import weka.classifiers.Classifier; import weka.classifiers.trees.J48; import weka.filters.Filter; import weka.filters.unsupervised.attribute.StringToWordVector; import java.io.*; public class MessageClassifier implements Serializable { /* The training data gathered so far. */ private Instances m_Data = null; /* The filter used to generate the word counts. */ private StringToWordVector m_Filter = new StringToWordVector(); /* The actual classifier. */ private Classifier m_Classifier = new J48(); /* Whether the model is up to date. */ private boolean m_UpToDate; /** * Constructs empty training dataset. */ public MessageClassifier() throws Exception { String nameOfDataset = "MessageClassificationProblem"; // Create vector of attributes. FastVector attributes = new FastVector(2); // Add attribute for holding messages. attributes.addElement(new Attribute("Message", (FastVector)null)); // Add class attribute. FastVector classValues = new FastVector(2); classValues.addElement("miss"); classValues.addElement("hit"); attributes.addElement(new Attribute("Class", classValues)); // Create dataset with initial capacity of 100, and set index of class. m_Data = new Instances(nameOfDataset, attributes, 100); m_Data.setClassIndex(m_Data.numAttributes() - 1); } /** * Updates data using the given training message. */ public void updateData(String message, String classValue) throws Exception { // Make message into instance. Instance instance = makeInstance(message, m_Data); // Set class value for instance. instance.setClassValue(classValue); // Add instance to training data. m_Data.add(instance); m_UpToDate = false; } /** * Classifies a given message. */ public void classifyMessage(String message) throws Exception { // Check whether classifier has been built. if (m_Data.numInstances() == 0) { ////throw new Exception("No classifier available."); } // Check whether classifier and filter are up to date. if (!m_UpToDate) { // Initialize filter and tell it about the input format. m_Filter.setInputFormat(m_Data); // Generate word counts from the training data. Instances filteredData = Filter.useFilter(m_Data, m_Filter); // Rebuild classifier. m_Classifier.buildClassifier(filteredData); m_UpToDate = true; } // Make separate little test set so that message // does not get added to string attribute in m_Data. Instances testset = m_Data.stringFreeStructure(); // Make message into test instance. Instance instance = makeInstance(message, testset); // Filter instance. m_Filter.input(instance); Instance filteredInstance = m_Filter.output(); // Get index of predicted class value. double predicted = m_Classifier.classifyInstance(filteredInstance); // Output class value. System.err.println("Message classified as : " + m_Data.classAttribute().value((int)predicted)); } /** * Method that converts a text message into an instance. */ private Instance makeInstance(String text, Instances data) { // Create instance of length two. Instance instance = new Instance(2); // Set value for message attribute Attribute messageAtt = data.attribute("Message"); instance.setValue(messageAtt, messageAtt.addStringValue(text)); // Give instance access to attribute information from the dataset. instance.setDataset(data); return instance; } /** * Main method. */ public static void main(String[] options) { try { // Read message file into string. String messageName = Utils.getOption('t', options); if (messageName.length() == 0) { throw new Exception("Must provide name of message file."); } FileReader m = new FileReader(messageName); StringBuffer message = new StringBuffer(); int l; while ((l = m.read()) != -1) { message.append((char)l); } m.close(); // Check if class value is given. String classValue = Utils.getOption('c', options); // If model file exists, read it, otherwise create new one. String modelName = Utils.getOption('m', options); if (modelName.length() == 0) { throw new Exception("Must provide name of model file."); } MessageClassifier messageCl; try { ObjectInputStream modelInObjectFile = new ObjectInputStream(new FileInputStream(modelName)); messageCl = (MessageClassifier) modelInObjectFile.readObject(); modelInObjectFile.close(); } catch (FileNotFoundException e) { messageCl = new MessageClassifier(); } // Check if there are any options left Utils.checkForRemainingOptions(options); // Process message. if (classValue.length() != 0) { messageCl.updateData(message.toString(), classValue); } else { messageCl.classifyMessage(message.toString()); } // Save message classifier object. ObjectOutputStream modelOutObjectFile = new ObjectOutputStream(new FileOutputStream(modelName)); modelOutObjectFile.writeObject(messageCl); modelOutObjectFile.close(); } catch (Exception e) { e.printStackTrace(); } } } |
下载源码: [此贴子已经被作者于2007-11-27 14:17:26编辑过]
|