;; bayes spam filter, based on bayespam.pl 0.9.2
;;
;; Builds per-word occurrence tables from a "good" and a "spam" mbox
;; file, derives a spam rating for each word, and prints the result.

(require 'stdlib)
(require 'rep.data.tables)

;; word -> number of occurrences in the good / spam corpus
(define good-occurrences (make-table string-hash #'equal))
(define spam-occurrences (make-table string-hash #'equal))

;; message counts, filled in by `read-mbox' through the symbol it is given
(define nr-of-good-messages 0)
(define nr-of-spam-messages 0)

;; word -> (good-count spam-count rating)
(define token-rating (make-table string-hash #'equal))

;; True when WORD has an entry in TABLE.
(define (occurs? word table)
  (table-bound-p table word))

;; Number of times WORD was recorded in TABLE; 0 when it has no entry.
(define (occurrence-count word table)
  (if (table-bound-p table word)
      (table-ref table word)
    0))

;; Record one more occurrence of WORD in TABLE (first sighting stores 1).
(define (add-occurrence word table)
  (table-set table word (1+ (occurrence-count word table))))

;; Print every "word: count" pair stored in TABLE.
(define (print-occurrences table)
  (table-walk (lambda (key value)
                (printf "%s: %s\n" key value))
              table))

;; Store RATING as the entry for WORD in the global `token-rating' table.
(define (set-rating word rating)
  (table-set token-rating word rating))

;; Print every "word: rating" pair stored in `token-rating'.
(define (print-ratings)
  (table-walk (lambda (key value)
                (printf "%s: %s\n" key value))
              token-rating))

;; Return the contents of FILE as a string.  The file is closed even if
;; reading signals an error.
(define (read-file-contents file)
  (let ((f (open-file file 'read)))
    (unwind-protect
        ;; NOTE(review): `read-chars' is expected to yield nil when the
        ;; file is empty; fall back to "" so callers always get a string.
        (or (read-chars f (file-size file)) "")
      (close-file f))))

;; Read the mbox FILE, add every whitespace-separated word to the
;; occurrence table TARGET, and store the message count in the variable
;; named by the symbol COUNT.  The message count is the number of pieces
;; obtained by splitting the text on "\nFrom ".
(define (read-mbox file target count)
  (let* ((text (read-file-contents file))
         (dict (string-split "\\s+" text)))
    (dolist (word dict)
      (add-occurrence word target))
    (set count (length (string-split "\nFrom " text)))
    (printf "%s: %s messages, %s words\n"
            file (symbol-value count) (length dict))
    ;; Debugging aid, intentionally disabled:
    ;; (print-occurrences target)
    ))

;; Rate WORD, a key of `good-occurrences'.  VALUE is the table value
;; passed by `table-walk' and is not used.
;;
;; A word that never appears in spam gets the fixed rating 0.01.
;; A word that appears in both corpora is rated only when
;; 2*good + spam >= 5; good occurrences are weighted double, both
;; frequencies are capped at 1, and the rating is clamped to
;; [1/100, 99/100] before being converted to an inexact number.
(define (rate-good word value)
  (let ((good-count (occurrence-count word good-occurrences))
        (spam-count (occurrence-count word spam-occurrences)))
    (if (occurs? word spam-occurrences)
        (when (>= (+ (* 2 good-count) spam-count) 5)
          (let* ((good-value (min 1 (/ (* 2 good-count)
                                       nr-of-good-messages)))
                 (bad-value (min 1 (/ spam-count nr-of-spam-messages)))
                 ;; bound locally; the rating must not leak into a global
                 (rating (max 1/100
                              (min 99/100
                                   (/ bad-value
                                      (+ good-value bad-value))))))
            (set-rating word
                        (list good-count
                              spam-count
                              (exact->inexact rating)))))
      (set-rating word (list good-count spam-count 0.01)))))

;; Rate WORD, a key of `spam-occurrences'.  VALUE is the table value
;; passed by `table-walk' and is not used.
;;
;; Only words absent from the good corpus are handled here; they get the
;; fixed rating 0.99 once they occur more than 5 times in spam.
;; NOTE(review): `rate-good' uses a ">= 5" threshold while this uses
;; "> 5" -- confirm whether the difference is intended.
(define (rate-spam word value)
  (unless (occurs? word good-occurrences)
    (let ((spam-count (occurrence-count word spam-occurrences)))
      (when (> spam-count 5)
        (set-rating word
                    (list (occurrence-count word good-occurrences)
                          spam-count
                          0.99))))))

;; Load both corpora, rate every word seen, and print the ratings.
(define (create-rating-table)
  (read-mbox "bayes.good" good-occurrences 'nr-of-good-messages)
  (read-mbox "bayes.spam" spam-occurrences 'nr-of-spam-messages)
  (table-walk rate-good good-occurrences)
  (table-walk rate-spam spam-occurrences)
  (print-ratings))

;; Classify MESSAGE using RATING-TABLE.  Not implemented yet; returns
;; the symbol TODO.
(define (check message rating-table)
  'TODO)

(create-rating-table)
(check 'message token-rating)