On a 1000 sample regex queries extracted from hive: Avg clauses per regex: 52 13 regexes generate more than 1024 clauses : stat1002:/home/dcausse/insource/more_than_1024.txt Code: @Test public void testInsource() throws IllegalArgumentException, TooComplexToDeterminizeException, IOException { Pattern pat = Pattern.compile("insource:/(.*?[^\\\\])/"); int above1024 = 0; int sum = 0; int total = 0; Map regexes = new HashMap<>(); try(BufferedReader br = new BufferedReader(new InputStreamReader(this.getClass().getResourceAsStream("/insource_queries.txt")))) { String line = null; while((line = br.readLine()) != null) { Matcher m = pat.matcher(line); while(m.find()) { String reg = m.group(1); int maxExpand = 4; int maxStatesTraced = 10000; int maxDeterminizedStates = 20000; int maxNgramsExtracted = 100; try { Automaton automaton = new RegExp(reg.toLowerCase(Locale.ENGLISH), RegExp.ALL ^ RegExp.AUTOMATON).toAutomaton(maxDeterminizedStates); NGramExtractor extractor = new NGramExtractor(3, maxExpand, maxStatesTraced, maxNgramsExtracted); int clauses = extractor.extract(automaton).countClauses(); sum += clauses; total++; if(clauses > 1024) { above1024++; regexes.put(line, clauses); } } catch(IllegalArgumentException iae) { System.out.println(iae); System.out.println(line + " : " + reg); } } } } System.out.println("Regex above : " + above1024); System.out.println("Regex avg clauses : " + sum/total); for(Map.Entry large : regexes.entrySet()) { System.out.println(large.getKey() + ", nb clauses: " + large.getValue()); } } Data set: /home/dcausse/insource/sample.txt