Bitfari WordFilter Social Oracle Walkthrough

A Bitfari Social Oracle is nothing more than a computer program that checks for adequate social behavior. Bitfari combines over 500 social oracles to check the fitness of ads for publication. Aside from these computer programs, Bitfari has three layers of human auditing. Since ads are paid, scanned and audited, the attack surface for spammers and pranksters is very thin.

Please refer to the Java code below. It is a simple string scanner connected to a CSV document. It first iterates over a message, converting numbers and special characters that stand in for letters back into those letters and dropping anything else that is not a letter. It then scans the resulting text for flagged natural words in every language.

import it.unimi.dsi.fastutil.longs.Long2ObjectMap;
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
import net.openhft.hashing.LongHashFunction;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;

/**
 * Integrated into Bitfari by J.V. Ledesma
 * Originally created by Pim De Witte
 * Performance drastically improved by over an order of magnitude by Thomas G. P. Nappo (Jire).
 * Garbage production has been eliminated as well.
 *
 * <p>Word-filter social oracle: keeps a set of flagged words (stored as 64-bit xxHash
 * values of the word with spaces removed) and reports whether a message contains any of
 * them after leetspeak normalization. Each flagged word may carry a list of "ignore"
 * fragments; when one of those fragments occurs anywhere in the normalized message the
 * hit for that word is suppressed (e.g. to allow "bass").
 *
 * <p>The word table is plain static state with no synchronization; only the scratch
 * buffer used by {@link #badWordsFound(String)} is per-thread.
 */
public class SOWordFilter {
	
	/** Spreadsheet (CSV export) holding one flagged word per row, optionally followed by ignore fragments. */
	private static final String WORD_LIST_URL =
			"https://docs.google.com/spreadsheets/d/1hIEi2YG3ydav1E06Bzf2mQbGZ12kh2fe4ISgLg_UBuM/export?format=csv";
	
	/** Hash function shared by registration and lookup so both sides produce the same keys. */
	private static final LongHashFunction HASHER = LongHashFunction.xx();
	
	/** Flagged words: hash of the space-stripped word mapped to its ignore fragments. */
	static Long2ObjectMap<String[]> words = new Long2ObjectOpenHashMap<>();
	
	/** Length of the longest registered word (spaces excluded); upper bound for the scan window. */
	static int largestWordLength = 0;
	
	/**
	 * Registers a single flagged word with no ignore fragments.
	 *
	 * <p>Messages are lower-cased and stripped of non-letters before they are scanned, so a
	 * word containing upper-case letters or non-letter characters (other than spaces, which
	 * are removed here) will never match.
	 *
	 * @param word the word to flag; spaces are removed before it is stored
	 */
	public static void flag(String word) {
		register(word, new String[]{});
	}
	
	/**
	 * Downloads the word list and registers every row.
	 *
	 * <p>Each CSV row is {@code word[,fragment1_fragment2_...]}: the first column is the
	 * flagged word and the optional second column is a {@code _}-separated list of ignore
	 * fragments. I/O failures are printed and leave the words loaded so far in place.
	 */
	public static void loadConfigs() {
		// try-with-resources closes the reader (and the underlying connection stream) on every path.
		// Decode explicitly as UTF-8 (the export is expected to be UTF-8) instead of the platform
		// default, so non-ASCII words produce the same hashes on every machine.
		try (BufferedReader reader = new BufferedReader(new InputStreamReader(
				new URL(WORD_LIST_URL).openConnection().getInputStream(), StandardCharsets.UTF_8))) {
			String line;
			int counter = 0;
			while ((line = reader.readLine()) != null) {
				counter++;
				try {
					String[] content = line.split(",");
					if (content.length == 0) {
						continue;
					}
					String[] ignoreFragments = content.length > 1 ? content[1].split("_") : new String[]{};
					register(content[0], ignoreFragments);
				} catch (RuntimeException e) {
					// Best effort: one malformed row must not abort the whole load.
					e.printStackTrace();
				}
			}
			System.out.println("Loaded " + counter + " words to filter across all languages");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Stores one flagged word and keeps {@link #largestWordLength} up to date.
	 *
	 * @param word            the raw word; spaces are removed before hashing
	 * @param ignoreFragments fragments that suppress a hit for this word when present in the message
	 */
	private static void register(String word, String[] ignoreFragments) {
		String key = word.replace(" ", "");
		if (key.isEmpty()) {
			// The scan never hashes a zero-length window, so an empty entry could never match.
			return;
		}
		// Measure the stripped word: that is what is hashed and what the scan window must cover.
		if (key.length() > largestWordLength) {
			largestWordLength = key.length();
		}
		words.put(HASHER.hashChars(key), ignoreFragments);
	}
	
	/**
	 * Leetspeak table: {@code {letter, symbol}}. The table is searched top to bottom and the
	 * first row whose symbol matches wins, so later rows for an already-listed symbol
	 * ('1' as 'l', '!' as 'l', '0' as 'd') are never reached.
	 */
	private static final char[][] convert = {
			{'o', '0'},
			{'i', '1'},
			{'l', '1'},
			{'t', '+'},
			{'e', '3'},
			{'i', '!'},
			{'l', '!'},
			{'s', '$'},
			{'a', '&'},
			{'a', '@'},
			{'c', '('},
			{'d', ')'},
			{'d', '0'},
			{'g', '6'},
			{'t', '7'},
			{'g', '9'},
			{'s', '5'},
			{'a', '4'}
	};
	
	/** Per-thread scratch buffer holding the normalized message; reused to avoid allocation per call. */
	private static final ThreadLocal<StringBuilder> NORMALIZED = ThreadLocal.withInitial(StringBuilder::new); // make this regular if you don't need thread safety.
	
	/**
	 * Checks whether a message contains a flagged word.
	 *
	 * <p>The message is first normalized: letters are lower-cased, leetspeak symbols from
	 * {@link #convert} are turned back into letters, and every other character (including
	 * whitespace) is dropped. Every substring of the normalized text up to
	 * {@link #largestWordLength} characters is then hashed and looked up. A hit is ignored
	 * when one of the word's ignore fragments occurs anywhere in the normalized text
	 * (e.g. "bass" contains "ass" but is allowed).
	 *
	 * @param input the message to check; may be {@code null}
	 * @return {@code true} if a flagged word is found and not excused by an ignore fragment;
	 *         {@code false} otherwise, including for {@code null} input
	 */
	public static boolean badWordsFound(String input) {
		if (input == null) {
			return false;
		}
		
		StringBuilder normalized = NORMALIZED.get();
		normalized.setLength(0);
		
		removeLeetspeak:
		for (int i = 0; i < input.length(); i++) {
			char c = input.charAt(i);
			if (Character.isLetter(c)) {
				normalized.append(Character.toLowerCase(c));
				continue;
			}
			for (char[] conversion : convert) {
				if (c == conversion[1]) {
					normalized.append(conversion[0]);
					continue removeLeetspeak;
				}
			}
			// Anything else (spaces, punctuation, unmapped digits) is dropped.
		}
		
		// From each position, try every window length up to the longest registered word
		// (inclusive, so that the longest word itself can match) or the end of the text.
		for (int start = 0; start < normalized.length(); start++) {
			int maxLength = Math.min(normalized.length() - start, largestWordLength);
			for (int length = 1; length <= maxLength; length++) {
				String[] ignoreFragments = words.get(HASHER.hashChars(normalized, start, length));
				if (ignoreFragments != null && !isIgnored(normalized, ignoreFragments)) {
					return true;
				}
			}
		}
		
		return false;
	}
	
	/**
	 * Tells whether any ignore fragment occurs in the normalized text.
	 *
	 * <p>Empty fragments are skipped: an empty string is "found" in every text and would
	 * otherwise silently disable the flagged word (this happens when the second CSV column
	 * is present but blank).
	 */
	private static boolean isIgnored(StringBuilder text, String[] ignoreFragments) {
		for (String fragment : ignoreFragments) {
			if (!fragment.isEmpty() && text.indexOf(fragment) >= 0) {
				return true;
			}
		}
		return false;
	}
	
}

Important caveats include provisions to allow words that contain bad words, such as the word bass. This is achieved with a list of special cases that should be allowed. Additionally, to improve performance, the length of the longest flagged word is recorded, giving the program a bound at which it knows to stop iterating.

Such social oracle programs are executed both in ad-composing clients and in mining programs to filter ads adequately. Executing social oracles guarantees that auditors can focus only on high-quality ads, and it reduces the attack surface available to bad actors.