Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/cluecode/copyrights.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from commoncode.text import toascii
from commoncode.text import unixlinesep
from textcode.gibberish import Gibberish
from pygmars import lex
from pygmars import parse
from pygmars import Token
Expand Down Expand Up @@ -60,6 +61,7 @@ def logger_debug(*args):
if TRACE_DEEP:
logger_debug = print


"""
Detect and collect copyright statements.

Expand Down Expand Up @@ -197,6 +199,7 @@ def detect_copyrights_from_lines(
if TRACE or TRACE_DEEP:
logger_debug(f'\n========================================================================')
logger_debug(f'detect_copyrights_from_lines: processing candidate_lines group:')

for can in candidate_lines:
logger_debug(f' {can}')

Expand Down Expand Up @@ -4265,6 +4268,8 @@ def strip_balanced_edge_parens(s):

is_only_digit_and_punct = re.compile('^[^A-Za-z]+$').match

gibberish_detector = Gibberish()


def is_candidate(prepared_line):
"""
Expand All @@ -4282,6 +4287,11 @@ def is_candidate(prepared_line):

return False

if gibberish_detector.detect_gibberish(prepared_line):
if TRACE:
logger_debug(f'is_candidate: gibberish_detector.detect_gibberish:\n{prepared_line!r}')
return False

if copyrights_hint.years(prepared_line):
return True
else:
Expand Down
5 changes: 5 additions & 0 deletions src/textcode/data/gibberish/bad.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
zxcvwerjasc
nmnjcviburili,<>
zxcvnadtruqe
ertrjiloifdfyyoiu
grty iuewdiivjh
128,457 changes: 128,457 additions & 0 deletions src/textcode/data/gibberish/big.txt

Large diffs are not rendered by default.

Binary file added src/textcode/data/gibberish/gib_model.pki
Binary file not shown.
100 changes: 100 additions & 0 deletions src/textcode/data/gibberish/good.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
Copyright (c) All Rights Reserved. Hair Plus Trading Co., Inc.
South Baylo University Copyright (c) All Right Reserved.
Created by shazron on 11-06-15. Copyright 2011 . All rights reserved.
Copyright (c) All Rights Reserved 2014-2019 New Avenue Foundation.
'Copyright 2017 AllThingsTalk'
Copyright (C) All Rights Are Reserved. Chungjungwon​. Iotacoffee.Com 2011
copyright(c) All rights reserved localism,Inc.
Crown Copyright C All rights reserved.
copyright(c) All rights reserved istyle Inc.
[assembly: AssemblyCopyright(""Copyright © 2013"")]
<span>Copyright (C) All Rights Reserved </span> <span>2007-2020版权所有: 镇江日报社 </span>
Copyright (c) - All Rights Reserved - PROAIM Medical.
Copyright (c), ALL Consulting, 2008
Created by Samvel Khalatyan, May 28, 2013 Copyright 2013, All rights reserved
Iotacoffee.Com 2011 Copyright (C) All Rights Are Reserved.
Copyright (C) All Rights Reserved, Lei Connection Inc.
Copyright(c) All Saints Episcopal Church, Fort Worth, 2011, church based at 3290 Lackland Road,, Fort Worth, TX 76116
* Created by claudio beatrice on 2/21/10. Copyright 2010. All rights reserved.
Copyright(c) All rights reserved by Minds, Japan Council for Quality Health Care.
Copyright (C) All Rights Reserved by Leh. www.leh.jp
Copyright (C) All rights Reserved by 株式会社 朝日住宅社
/* For iOS video I/O
* by Eduard Feicho on 29/07/12
* Copyright 2012. All rights reserved.
// Copyright (c) 2002-2010, Industrial Light & Magic, a division of Lucas
// Digital Ltd. LLC
//
// All rights reserved.
Copyright (c) 2006, Industrial Light & Magic, a division of Lucasfilm
Entertainment Company Ltd. Portions contributed and copyright held by
others as indicated. All rights reserved.
copyright__ = 'Copyright 2017 AllThingsTalk'
Copyright EAVISE
UCL are copyrighted software distributed
Foursquare © 2019
Copyright (C) 2019, by Djilani CARDINEAU.
# Copyright michimani All rights reserved.
Copyright(c) All Rights Reserved by Chinese Service Center for Scholarly Exchange
Copyright(c) All right reserved SSC. Ltd.
Third party copyrights are property of their respective owners.
Copyright (c) All Rights Reserved by the District Export Council of Georgia.
//COPYRIGHT
//
//All contributions by the University of California:
//Copyright (c) 2014, The Regents of the University of California (Regents)
//All rights reserved.
//
//All other contributions:
//Copyright (c) 2014, the respective contributors
//All rights reserved.
//
//Caffe uses a shared copyright model: each contributor holds copyright over
//their contributions to Caffe. The project versioning records all such
//contribution and copyright details. If a contributor wants to further mark
//their specific copyright on a particular contribution, they should indicate
//their copyright solely in the commit message of the change when it is
//committed.
//
//LICENSE
Copyright (C) 2013 Opensim Ltd.
#COPYRIGHT
#
#All contributions by the University of California:
#Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
#All rights reserved.
#
#All other contributions:
#Copyright (c) 2014, 2015, the respective contributors
#All rights reserved.
LICENSE: Copyright 2016, All Rights Reserved
(a)Download original face detection dataset -> (b)Convert annotation to the PASCAL VOC format -> (c)Create LMDB database with images + annotations for training
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
Copyright (C), 2001-2011, Acme Tech. Co. Ltd.
* libtiff/{tif_dirinfo.c, tif_dir.h, tif_dir.c, tif_print.c}: Make
DocumentName, Artist, HostComputer, ImageDescription, Make, Model,
Copyright, DateTime, PageName, TextureFormat, TextureWrapModes and
TargetPrinter tags custom.
COPYRIGHT (C) All About, Inc. All Rights Reserved.
Copyright 2019, All Rights Reserved. # Author: Pine <[email protected]>
* For iOS video I/O
* by Eduard Feicho on 29/07/12
* by Alexander Shishkov on 17/07/13
* Copyright 2012. All rights reserved.
COPYRIGHT(C) ALL JAPAN PRO-WRESTLING Co., Ltd.
:copyright: Copyright (c) Joe Joyce and contributors, 2016-2019.
Copyright 2014 uh-sem-blee, Co.
Copyright (c) 2016 the Authors
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
* For iOS video I/O
* by Xiaochao Yang on 06/15/11 modified from
* cap_qtkit.mm for Nicholas Butko for Mac OS version.
* Copyright 2011. All rights reserved.
Copyright (c) All the Raige Dog Salon. All Rights Reserved.
[assembly: AssemblyCopyright(""Copyright © 2014"")]
<a href="http://www.enox.biz/">Copyright (C) All rights Reserved by 株式会社エノックス</a>
2008 Nuance Communications
Copyright 2008 TJ
Scilab (c)INRIA-ENPC
Copyright (c) 2006, FUJITA Yuji
125 changes: 125 additions & 0 deletions src/textcode/gibberish.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#!/usr/bin/python
#
# From: https://raw.githubusercontent.com/yapus/gibberish/01637fe1fda827529ca76b8d6fee2de9100719f1/gibberish/gibberish.py
#
# 12Jun2017 Petr Janata - added srcfile and outfile
# 17Jun2017 Petr Janata - expanded set of accepted characters to include digits and hyphen
#
# which is based off of:
# https://raw.githubusercontent.com/rrenaud/Gibberish-Detector/aa1d4e4555362b3dada97ebe6ecc23a84fc470fe/gib_detect_train.py
#

import math
import pickle
from pathlib import Path

# Locations of the persisted model and of the training/calibration corpora,
# all resolved relative to this module.
data_dir = Path(__file__).parent / 'data' / 'gibberish'
model_path = data_dir / 'gib_model.pki'
big_file_path = data_dir / 'big.txt'
good_file_path = data_dir / 'good.txt'
bad_file_path = data_dir / 'bad.txt'

# Alphabet of the character-transition model; every other character is
# dropped by Gibberish.normalize().
accepted_chars = 'abcdefghijklmnopqrstuvwxyz0123456789- '
# Map each accepted character to its row/column index in the matrix.
pos = {char: idx for idx, char in enumerate(accepted_chars)}


class Gibberish(object):
    """
    Character-bigram gibberish detector.

    The model is a matrix of log transition probabilities between the
    characters of ``accepted_chars`` (``self.mat``) plus a probability
    threshold (``self.thresh``). A text whose average transition
    probability is below the threshold is reported as gibberish.
    """

    def __init__(self):
        """
        Load the persisted model if ``model_path`` exists, otherwise train a
        new model (which also persists it).
        """
        if model_path.exists():
            self.load_persisted_model()
        else:
            self.train()

    def persist_model(self):
        """Pickle the instance attributes to ``model_path``."""
        with open(model_path, 'wb') as f:
            pickle.dump(vars(self), f)

    def load_persisted_model(self):
        """
        Restore the instance attributes from the pickle at ``model_path``.
        """
        # NOTE(review): pickle.load can execute arbitrary code when fed a
        # tampered file. This is presumably only ever the model file shipped
        # next to this module; never point model_path at untrusted data.
        with open(model_path, 'rb') as f:
            persisted_model = pickle.load(f)
        for key, value in persisted_model.items():
            setattr(self, key, value)

    def normalize(self, line):
        """
        Return the list of lowercased characters of ``line`` that are in
        ``accepted_chars``.

        This keeps the model relatively small by ignoring punctuation,
        infrequent symbols, etc.
        """
        lowered = (c.lower() for c in line)
        # ``pos`` has exactly the accepted characters as keys; a dict lookup
        # avoids the substring semantics of ``in`` on a str.
        return [c for c in lowered if c in pos]

    def ngram(self, n, l):
        """Yield every ``n``-gram (as a str) of ``l`` after normalizing it."""
        filtered = self.normalize(l)
        for start in range(len(filtered) - n + 1):
            yield ''.join(filtered[start:start + n])

    def avg_transition_prob(self, l, log_prob_mat):
        """
        Return the average transition probability of ``l`` through
        ``log_prob_mat``.

        The result is the exponential of the mean log probability over all
        the bigrams of the normalized text. A text with no bigram (fewer
        than two accepted characters) yields 1.0.
        """
        log_prob = 0.0
        transition_ct = 0
        for a, b in self.ngram(2, l):
            log_prob += log_prob_mat[pos[a]][pos[b]]
            transition_ct += 1
        # The exponentiation translates from log probs to probs.
        # ``or 1`` avoids a division by zero when there is no transition.
        return math.exp(log_prob / (transition_ct or 1))

    def train(self, bigfile=big_file_path, goodfile=good_file_path,
              badfile=bad_file_path):
        """
        Build the model from ``bigfile``, calibrate the threshold with
        ``goodfile`` and ``badfile`` and write the model as a pickle file.

        Set ``self.mat`` and ``self.thresh``. Raise an AssertionError if
        the model cannot tell the good lines from the bad lines.
        """
        k = len(accepted_chars)
        # Assume we have seen 10 of each character pair. This acts as a kind
        # of prior or smoothing factor. This way, if we see a character
        # transition live that we've never observed in the past, we won't
        # assume the entire string has 0 probability.
        counts = [[10] * k for _ in range(k)]

        # Count transitions from big text file, taken
        # from http://norvig.com/spell-correct.html
        # Files are decoded as UTF-8 regardless of the locale; undecodable
        # bytes become U+FFFD, which normalize() drops like any other
        # non-accepted character.
        with open(bigfile, encoding='utf-8', errors='replace') as lines:
            for line in lines:
                for a, b in self.ngram(2, line):
                    counts[pos[a]][pos[b]] += 1

        # Normalize the counts so that they become log probabilities.
        # We use log probabilities rather than straight probabilities to
        # avoid numeric underflow issues with long texts.
        # This contains a justification:
        # http://squarecog.wordpress.com/2009/01/10/dealing-with-underflow-in-joint-probability-calculations/
        for row in counts:
            total = float(sum(row))
            row[:] = [math.log(count / total) for count in row]

        # Find the probability of generating a few arbitrarily chosen good
        # and bad phrases.
        with open(goodfile, encoding='utf-8', errors='replace') as lines:
            good_probs = [self.avg_transition_prob(l, counts) for l in lines]
        with open(badfile, encoding='utf-8', errors='replace') as lines:
            bad_probs = [self.avg_transition_prob(l, counts) for l in lines]

        # Check that we actually are capable of detecting the junk. This is
        # an explicit raise rather than an ``assert`` statement so that the
        # check is not stripped when running with ``python -O``.
        worst_good = min(good_probs)
        best_bad = max(bad_probs)
        if worst_good <= best_bad:
            raise AssertionError(
                'gibberish model cannot separate good from bad samples: '
                f'min(good)={worst_good!r} <= max(bad)={best_bad!r}'
            )

        # And pick a threshold halfway between the worst good and best bad
        # inputs.
        self.mat = counts
        self.thresh = (worst_good + best_bad) / 2
        self.persist_model()

    def detect_gibberish(self, text):
        """
        Return True if the average transition probability of ``text`` is
        below the model threshold.
        """
        # avg_transition_prob() normalizes the text itself.
        return self.avg_transition_prob(text, self.mat) < self.thresh

    def percent_gibberish(self, text):
        """
        Return the fraction (between 0 and 1) of the whitespace-separated
        words of the normalized ``text`` that are detected as gibberish.
        Return 0 if there is no word.
        """
        # split() without argument drops the empty strings produced by
        # leading, trailing or repeated spaces (e.g. where punctuation was
        # removed), so they do not dilute the ratio.
        words = ''.join(self.normalize(text)).split()
        if not words:
            return 0.0

        gibberish_count = sum(
            1 for word in words if self.detect_gibberish(word)
        )
        return float(gibberish_count) / float(len(words))

    def gibberish_pct(self, text):
        """
        Return the average transition probability of ``text`` using the
        model matrix.

        Despite its name, this is not a percentage of gibberish: it is the
        value that detect_gibberish() compares against the threshold, and
        lower values are the ones reported as gibberish.
        """
        return self.avg_transition_prob(text, self.mat)
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,3 @@ what:
- copyrights
- holders
- authors
copyrights:
- (c) (c) 2AICAA3SSY
holders:
- 2AICAA3SSY
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,3 @@ what:
- copyrights
- holders
- authors
copyrights:
- U1e (c) IjAx
holders:
- U1e IjAx
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,3 @@ what:
- copyrights
- holders
- authors
copyrights:
- Xz eaaeuyATNRU (c) Ijr
holders:
- Xz eaaeuyATNRU Ijr
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,3 @@ what:
- copyrights
- holders
- authors
copyrights:
- (c) cc.fr
holders:
- cc.fr
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,3 @@ what:
- copyrights
- holders
- authors
copyrights:
- (c) Oo2 UOY
holders:
- Oo2 UOY
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,3 @@ what:
- copyrights
- holders
- authors
copyrights:
- I. (c) Uao
holders:
- I. Uao
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,3 @@ what:
- copyrights
- holders
- authors
copyrights:
- (c) UOSSOO-O (c)
holders:
- UOSSOO-O
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,3 @@ what:
- copyrights
- holders
- authors
copyrights:
- (c) Cj d Dj
holders:
- Cj d Dj
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,3 @@ what:
- copyrights
- holders
- authors
copyrights:
- (c) ,33 ,BD(b.Xb(c+1),33) d d BVc
holders:
- BD(b.Xb(c+1),33) d d BVc
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,3 @@ what:
- copyrights
- holders
- authors
copyrights:
- (c) q ltd
holders:
- q ltd
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,3 @@ what:
- copyrights
- holders
- authors
copyrights:
- "(c) .'2\x14'OTh'q\x04\x19deg^ A1 Co"
holders:
- "2\x14'OTh'q\x04\x19deg^ A1 Co"
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,3 @@ what:
- copyrights
- holders
- authors
copyrights:
- (c) (c) SS
holders:
- SS
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,3 @@ what:
- copyrights
- holders
- authors
copyrights:
- (c) (c) Y
holders:
- Y
Loading
Loading