What is the ID3 algorithm?

ID3 algorithm is a greedy algorithm used to construct a decision tree. ID3 algorithm originated from concept learning system (CLS). It takes the decreasing speed of information entropy as the criterion to select test attributes, that is, it selects the attributes with the highest information gain that have not been used to classify as the criterion at each node, and then continues this process until the generated decision tree can perfectly classify training samples.

ID3 algorithm pseudo-code

ifSample S all belong to the same category CthenCreate a leaf node and mark it with class label C;return;elseThe information gain of each attribute in attribute set F is calculated, assuming that the attribute with the maximum gain value is A. The node is created and attribute A is taken as the decision attribute of the node.forEvery possible value V of node property AdoAdd A new branch to this node, assuming that SV is A subset of the sample whose attribute A is set to V;ifSample SV all belong to the same category CthenAdd a leaf node to the branch and mark the class label as C;elseA recursive call DT(SV, F-{A}) creates A subtree for the branch; endif
   end for
end if

Copy the code

ID3 algorithm Java implementation

public class Id3Util {

    /** * Store attribute name */
    private ArrayList<String> attribute = new ArrayList<>();
    /** * Stores the value of each attribute */
    private ArrayList<ArrayList<Integer>> attributeValue = new ArrayList<>();
    /** * original data */
    private ArrayList<Integer[]> data = new ArrayList<>();
    /** * The index of the decision variable in the attribute set */
    int decatt = 0;
    
    public static final String patternString = "@attribute(.*)[{](.*?) [}]. "";

    private Document document;
    private static Element root;

    public Id3Util(a){
        document = DocumentHelper.createDocument();
        root = document.addElement("root");
        root.addElement("DecisionTree").addAttribute("value"."null");
    }

    /** * Initialize the decision tree */
    public void init(a){
        // Read the arFF file
        readArff(new File("C:\\1.arff"));
        // Set the decision variable
        setDecatt("purchase");
        ArrayList<Integer> attributeIndexList =new ArrayList<>(attribute.size());
        for(int i = 0; i < attribute.size(); i++){
            if(i ! = decatt) { attributeIndexList.add(i); } } ArrayList<Integer> dataIndexList =new ArrayList<>(data.size());
        for(int i = 0; i < data.size(); i++){
            dataIndexList.add(i);
        }
        // Create a decision tree
        this.buildDecisionTree("DecisionTree".null, dataIndexList, attributeIndexList, root);
        // Put the document object
        this.writeXML("C:\\decision_tree.xml");
    }

    /** * Read arff file *@paramFile Arff file */
    private void readArff(File file){
        try {
            FileReader fileReader = new FileReader(file);
            BufferedReader bufferedReader = new BufferedReader(fileReader);
            String line = null;
            Pattern pattern = Pattern.compile(patternString);
            while((line = bufferedReader.readLine()) ! =null){
                Matcher matcher = pattern.matcher(line);
                if(matcher.find()){
                    attribute.add(matcher.group(1).trim());
                    String[] values = matcher.group(2).split(",");
                    ArrayList<Integer> array = new ArrayList<>(values.length);
                    for(String value: values){
                        array.add(Integer.parseInt(value.trim()));
                    }
                    attributeValue.add(array);
                }else if(line.startsWith("@data")) {while((line = bufferedReader.readLine()) ! =null) {if(line == "") {continue;
                        }
                        String[] array = line.split(",");
                        Integer[] row = new Integer[array.length];
                        for(int i = 0; i < array.length; i++){ row[i] = Integer.parseInt(array[i]); } data.add(row); }}else {
                    continue;
                }
            }
            bufferedReader.close();
        } catch(IOException e) { e.printStackTrace(); }}/** * Sets the decision variable *@paramN Decision variable index */
    private void setDecatt(int n) {
        if (n < 0 || n >= attribute.size()) {
            System.out.println("Decision variable set failed");
            System.exit(2);
        }
        decatt = n;
    }

    /** * Sets the decision variable *@paramName Name of the decision variable */
    private void setDecatt(String name) {
        int n = attribute.indexOf(name);
        setDecatt(n);
    }

    /** * Computes the information entropy of the given array *@paramArray an array of *@return* /
    private double getEntropy(int[] array){
        int sum = 0;
        for(int i = 0; i < array.length; i++){
            sum += array[i];
        }
        return getEntropy(array, sum);
    }

    /** * Computes the information entropy of the given array *@paramArray an array of *@paramSum The arithmetic sum * of the given array@return* /
    private double getEntropy(int[] array, int sum) {
        double entropy = 0.0;
        for (int i = 0; i < array.length; i++) {
            if(array[i] == 0) {continue;
            }
            entropy -= ((double) array[i] / sum) * Utils.log2((double) array[i] / sum);
        }
        return entropy;
    }

    /** * Check whether the given dataset belongs to the same category *@paramSubset data set *@returnCheck result */
    private boolean isClassesUnanimous(ArrayList<Integer> subset){
        int count = 1;
        Integer value = data.get(subset.get(0))[decatt];
        for(int i = 1; i < subset.size(); i++){
            Integer next = data.get(subset.get(i))[decatt];
            if(value.equals(next)){ ++count; }}// Calculate the probability of belonging to the same category in the dataset, if the probability is greater than one of the values we set
        // This dataset can be considered to belong to the same category
        double ratio = (double) count / subset.size();
        return ratio >= 0.95 ? true : false;
    }

    /** * Calculate the information entropy of a subset of the original data with the index attribute as the node *@paramSubset Subset index set *@paramIndex Indicates the attribute index *@return* /
    private double calNodeEntropy(ArrayList<Integer> subset, int index){
        // The number of data sets left
        int sum = subset.size();
        double entropy = 0.0;
        int[][] info = new int[attributeValue.get(index).size()][];
        for(int i = 0; i < info.length; i++){
            info[i] = new int[attributeValue.get(decatt).size()];
        }
        int[] count = new int[attributeValue.get(index).size()];
        for(int i = 0; i < sum; i++){
            int n = subset.get(i);
            Integer nodeValue = data.get(n)[index];
            int nodeIndex = attributeValue.get(index).indexOf(nodeValue);
            count[nodeIndex]++;
            Integer decattValue = data.get(n)[decatt];
            int decattIndex = attributeValue.get(decatt).indexOf(decattValue);
            info[nodeIndex][decattIndex]++;
        }
        for(int i = 0; i < info.length; i++){
            entropy += getEntropy(info[i]) * count[i] / sum;
        }
        return entropy;
    }

    /** * Create decision tree *@paramName Node name *@paramThe value value *@paramSubset Specifies the index set of a subset@paramThe index set of the selatt attribute set */
    private void buildDecisionTree(String name, String value, ArrayList<Integer> subset, ArrayList<Integer> selatt, Element parent){
        Element element = null;
        List<Element> list = parent.selectNodes(name);
        Iterator<Element> iterator = list.iterator();
        // Determine the location of element
        while(iterator.hasNext()){
            element = iterator.next();
            if(element.attributeValue("value").equals(value)){
                break; }}// Create a leaf node if the dataset belongs to the same category
        if(isClassesUnanimous(subset)) {
            element.setText(String.valueOf(data.get(subset.get(0))[decatt]));
            return;
        }
        int minIndex = -1;
        int minEntropySelatt = -1;
        double minEntropy = Double.MAX_VALUE;
        // Get the index of the lowest attribute entropy and assign it to minIndex
        for(int i = 1; i < selatt.size(); i++){
            if(i == decatt){
                continue;
            }
            // Calculate the information entropy of each attribute
            double entropy = calNodeEntropy(subset, selatt.get(i));
            if(entropy < minEntropy){
                minEntropySelatt = selatt.get(i);
                minIndex = i;
                minEntropy = entropy;
            }
        }
        String nodeName = attribute.get(minEntropySelatt);
        // Remove this attribute from the property list
        ArrayList<Integer> remainSelatt = removeAttribute(selatt, minIndex);
        // Get the range of values for this property
        ArrayList<Integer> attributeValues = attributeValue.get(minEntropySelatt);
        for(Integer attValue : attributeValues){
            Element newElement = element.addElement(nodeName).addAttribute("value", String.valueOf(attValue));
            ArrayList<Integer> remainSubset = new ArrayList<>();
            for (int i = 0; i < subset.size(); i++){
                if(data.get(subset.get(i))[minEntropySelatt].equals(attValue)){ remainSubset.add(subset.get(i)); }}// If the sample subset is empty, delete this node
            if(remainSubset.size() == 0){
                element.remove(newElement);
                continue;
            }
            // if all subsets belong to the same category, create leaf nodes and label them
            if(isClassesUnanimous(remainSubset)){
                newElement.setText(String.valueOf(data.get(remainSubset.get(0))[decatt]));
                continue;
            }
            // Recursive calls build the treebuildDecisionTree(nodeName, String.valueOf(attValue), remainSubset, remainSelatt, element); }}/** * Deletes the attribute * of the specified index in the property table@paramSelatt Property table *@paramIndex Specifies the index *@return* /
    private ArrayList<Integer> removeAttribute(ArrayList<Integer> selatt, int index) {
        ArrayList<Integer> arrayList = new ArrayList<>(selatt);
        arrayList.remove(index);
        return arrayList;
    }

    /** * writes XML to a file *@paramFileName indicates the fileName */
    private void writeXML(String fileName){
            try {
                File file = new File(fileName);
                if(! file.exists()) { file.createNewFile(); } FileWriter fileWriter =new FileWriter(file);
                OutputFormat outputFormat = OutputFormat.createPrettyPrint();
                XMLWriter writer = new XMLWriter(fileWriter, outputFormat);
                writer.write(document);
                writer.close();
            } catch (IOException e) {
                System.out.println("Failed to write XML file"); }}}Copy the code

Reference: Inductive decision tree ID3 (Java implementation)