[TriLUG] OT: non-sense spam

davis davis at skink.net
Thu Jan 8 11:19:46 EST 2004


On Thu, Jan 08, 2004 at 11:01:02AM -0500, Marty Ferguson wrote:
> Yes, I have been.
> 
> As far as the "To:" list, I think that it is composed of bogus prefixes
> catenated to legitimate suffixes.  I looked closely at a similar header
> yesterday.
> 
> As far as the GIBBERISH
> I believe that this technique is intended to allow the spam to slip through
> bayesian filters.  The filters need a new heuristic which looks not only at
> frequency of single "words" but also examines context to some extent.

(Saying in an Ed McMahon voice) You are correct, sir!

<stuff deleted>

On a related note

I've been stopping spam at my firewall for a while now.  Previously I posted
to this list some code which gleaned IP addresses from spam via Perl.  I 
recently rewrote this code in C.  If you are interested, here is
a portion of the code.  If nothing else, it demonstrates the regexp capability
of glibc.  Also, it can harvest the offending IPs from 9 MB of spam email 
in less than 10 seconds on a 60 MHz SPARC.  (Part of the reason why I moved 
from Perl to C was for the speed increase.) I've attached the code of interest
below.  If you want the full code, it's at:
http://www.skink.net/~davis/progs/c/spam1

Also, it's not perfect, but it's free.  Comments and suggestions are welcome.

It runs like so:
$./driver filename
where filename is a file containing the spam emails.  

#include <netdb.h>
#include <stdio.h>
#include <ctype.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>
#include <regex.h>
#include "tree.h"

/* John F. Davis 2003 */
/* Portions of compile_regexp were taken from Ben Tindale's Linux Gazette
 * glibc regular expression example code. */

/* global */
extern tree_t *rt;
const int DEBUG_TRACE=0;

/*
 Notes about matching lines:

 Possible matching lines:
	Received: from [ip address] (helo=mail42.hostmynetwork.com)
	Received: from [ip address] (helo=USER)

	Received: from hostname ([ip address] helo=mail)
	Received: from hostname ([ip address])
	Received: from hostname ([ip address] helo=liveconsumer.net)

 All we care about is ip.  Try to match just the stuff in brackets.

*/


/*
 * Run the compiled regular expression `r` against `line`.
 *
 * On a match, *start and *end receive the byte offsets (relative to `line`)
 * of the first parenthesised sub-expression: *start is the offset of its
 * first character and *end is the offset one past its last character.  If
 * the pattern has no such group, regexec() reports -1 for both offsets and
 * those values are passed through unchanged.  On no match, *start and *end
 * are left untouched.
 *
 * Returns the regexec() result: 0 on a match, non-zero (e.g. REG_NOMATCH)
 * otherwise.
 */
int check_for_match(regex_t *r, char *line, int *start, int *end)
{
	/*
	 * Slot 0 describes the whole match, slot 1 the first \( \) group.
	 * The array lives on the stack so nothing has to be freed; this
	 * function is called once per input line.
	 */
	regmatch_t result[2];
	const size_t num_matches = sizeof result / sizeof result[0];
	int rc;

	rc = regexec(r, line, num_matches, result, 0);
	if (rc != 0) {
		return rc;
	}

	if (DEBUG_TRACE) {
		/* regoff_t is not necessarily int, so cast before printing. */
		printf("\t ***match ***\n");
		printf("\t ***%s",line);
		printf("\t ***01234567890123456789012345678901234567890123456789012345678901234567890***\n");
		printf("\t ***result[0].rm_so = %ld ***\n",(long) result[0].rm_so);
		printf("\t ***result[0].rm_eo = %ld ***\n",(long) result[0].rm_eo);
		printf("\t ***result[1].rm_so = %ld ***\n",(long) result[1].rm_so);
		printf("\t ***result[1].rm_eo = %ld ***\n",(long) result[1].rm_eo);
	}

	*start = (int) result[1].rm_so;
	*end = (int) result[1].rm_eo;

	return rc;
}
/*
 * Compile the basic (non-extended) regular expression `p` into `r`.
 *
 * Returns EXIT_SUCCESS when the pattern compiled; the caller then owns `r`
 * and must release it with regfree() when finished.
 *
 * Returns EXIT_FAILURE when regcomp() rejects the pattern, after printing
 * the regerror() message to stderr.  In that case `r` must not be passed to
 * regexec() or regfree(): POSIX leaves its contents undefined after a failed
 * regcomp(), so nothing is freed here either.
 */
int compile_regexp(regex_t *r, char *p) 
{
	int err_no; /* For regerror() */
	size_t length;
	char *buffer;

	err_no = regcomp(r, p, 0); /* Compile the regex */
	if (err_no == 0) {
		return EXIT_SUCCESS;
	}

	/* First call with a NULL buffer only asks how long the message is. */
	length = regerror(err_no, r, NULL, 0);
	buffer = malloc(length);
	if (buffer != NULL) {
		regerror(err_no, r, buffer, length);
		fprintf(stderr, "%s\n", buffer); /* Print the error */
		free(buffer);
	} else {
		/* Could not allocate room for the text; report the code instead. */
		fprintf(stderr, "regcomp failed (error code %d)\n", err_no);
	}

	return EXIT_FAILURE;
}

/* Two-valued flag type: false has the value 0 and true the value 1. */
enum boolean {
	false,
	true
};

/*
 * Scan the file named on the command line for "Received: from ... [ip]"
 * header lines, insert the bracketed text of the first such line of each
 * blank-line-delimited section into the global tree `rt`, then print the
 * tree with inorder() and release it with freetree().
 *
 * Exit status: 0 on success, 99 for a usage error or an unreadable input
 * file, EXIT_FAILURE if the pattern fails to compile or memory runs out.
 */
int main(int argc, char *argv[]) {

	FILE *from;
	int start = 0, end = 0;	/* offsets of the bracketed text in inputline */
	size_t linelen, iplen;
	char inputline[512];	/* input line from file */
	char *treebuffer;	/* heap copy of the ip handed to the tree */
	regex_t regex;
	char *pattern = "^Received: from.*\\[\\(.*\\)\\]";
	enum boolean looking;

	if (argc != 2) {
		printf("Usage: %s <input file>\n", argv[0]);
		exit(99);
	}

	if ((from = fopen(argv[1], "r")) == NULL) {
		perror(argv[1]);
		exit(99);
	}

	/* compile_regexp() reports the reason itself; just stop here. */
	if (compile_regexp(&regex, pattern) != EXIT_SUCCESS) {
		fclose(from);
		exit(EXIT_FAILURE);
	}

	/*
	 * `looking` is true until a Received line has been harvested and is
	 * re-armed by the next blank line, so at most one address is taken
	 * from each run of non-blank lines.
	 */
	looking = true;
	while (fgets(inputline, sizeof(inputline), from) != NULL) {
		linelen = strlen(inputline);
		if (check_for_match(&regex, inputline, &start, &end) == 0 && looking) {
			looking = false;

			/* Copy just the text between the brackets. */
			iplen = (size_t) (end - start);
			treebuffer = malloc(iplen + 1);
			if (!treebuffer) {
				fprintf(stderr, "out of memory\n");
				regfree(&regex);
				fclose(from);
				exit(EXIT_FAILURE);
			}
			memcpy(treebuffer, inputline + start, iplen);
			treebuffer[iplen] = '\0';

			/*
			 * Add this entry to the tree.  The first insertion
			 * establishes the root; later ones hang off it.
			 * NOTE(review): treebuffer is presumably owned by the
			 * tree from here on - confirm against tree.h.
			 */
			if (!rt) {
				rt = stree(rt, rt, treebuffer);
			} else {
				stree(rt, rt, treebuffer);
			}

		} else if (linelen == 1) {
			looking = true;	// if its a blank line, start looking again.
		}
	}

	fclose(from);

	regfree(&regex); /* Free the regular expression data structure */

	// Dump the tree
	inorder(rt);

	// Free the tree
	freetree(rt);
	/* Drop the stale root so the check below never walks freed nodes. */
	rt = NULL;

	printf("just a check\n");
	inorder(rt);

	return 0;
}


	




-- 
Happy Trails	

John F. Davis
ABC #6334 1992 R100GSPD                    Durham, North Carolina
http://www.skink.net



More information about the TriLUG mailing list