From f60109308b38984bfe852f58aea209863e482ff3 Mon Sep 17 00:00:00 2001
From: Simon Suchomel
Date: Mon, 13 Aug 2012 18:02:42 +0200
Subject: [PATCH] Monday writing.

---
 paper.bib              |  36 +++++++++++++
 paper.tex              |  26 +++++++--
 simon-searchengine.tex | 118 ++++++++++++++++++++++++++++++++++++++---
 yenya-detailed.tex     |   2 +-
 4 files changed, 171 insertions(+), 11 deletions(-)
 mode change 100644 => 100755 paper.bib
 mode change 100644 => 100755 yenya-detailed.tex

diff --git a/paper.bib b/paper.bib
old mode 100644
new mode 100755
index c4e9b52..97666d3
--- a/paper.bib
+++ b/paper.bib
@@ -1,3 +1,39 @@
+@INPROCEEDINGS{chatnoir,
+  AUTHOR    = {Martin Potthast and Matthias Hagen and Benno Stein and Jan Gra{\ss}egger and Maximilian Michel and Martin Tippmann and Clement Welsch},
+  TITLE     = {{ChatNoir: A Search Engine for the ClueWeb09 Corpus}},
+  BOOKTITLE = {35th International ACM Conference on Research and Development in Information Retrieval (SIGIR 12), to appear},
+  EDITOR    = {Bill Hersh and Jamie Callan and Yoelle Maarek and Mark Sanderson},
+  ADDRESS   = {Portland, Oregon},
+  MONTH     = aug,
+  YEAR      = {2012}
+}
+
+@BOOK{text_patterns,
+  AUTHOR    = {Mike Scott and Christopher Tribble},
+  TITLE     = {{Textual Patterns: Key Words and Corpus Analysis in Language Education}},
+  PAGES     = {55--72},
+  PUBLISHER = {John Benjamins Publishing Company},
+  YEAR      = {2006}
+}
+
+@MISC{ententen,
+  KEY          = {Corpus},
+  TITLE        = {{Sketch Engine enTenTen Corpus}},
+  HOWPUBLISHED = {\url{http://trac.sketchengine.co.uk/wiki/Corpora/enTenTen}},
+  YEAR         = {2012}
+}
+
+@INPROCEEDINGS{Knight,
+  AUTHOR    = {Allan Knight and Kevin Almeroth and Bruce Bimber},
+  TITLE     = {An Automated System for Plagiarism Detection Using the Internet},
+  BOOKTITLE = {Proceedings of the World Conference on Educational Multimedia, Hypermedia and Telecommunications},
+  PAGES     = {3619--3625},
+  YEAR      = {2004}
+}
+
 @INPROCEEDINGS{Kasprzak2008,
   AUTHOR = "Jan Kasprzak and Michal Brandejs and Miroslav K\v{r}ipa\v{c} and Pavel {\v S}merk",

diff --git a/paper.tex b/paper.tex
index e098f4a..92e4a39 100755
--- a/paper.tex
+++ b/paper.tex
@@ -8,7 +8,7 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{document}
 
-\title{Your Title}
+\title{Three-way search engine queries with multi-feature document comparison for plagiarism detection}
 
 %%% Please do not remove the subtitle.
 \subtitle{Notebook for PAN at CLEF 2012}
@@ -28,8 +28,20 @@
 Briefly describe the main ideas of your approach.
 
 %The notebooks shall contain a full write-up of your approach, including all details necessary to reproduce your results.
 
-Due to the increasing ease of plagirism the plagiarism detection has nowdays become a need for many instutisions. Especially for universities where modern learning methods include e-learning and a vast document sources are online available.
-
+Due to the increasing ease of plagiarism, plagiarism detection has nowadays become a necessity for many institutions,
+especially for universities, where modern learning methods include e-learning and vast document sources are available online.
+In the Information System of Masaryk University there is also an antiplagiarism tool, based upon the same principles as those presented in this paper.
+The core methods for automatic plagiarism detection, which also work in practice on extensive collections of documents,
+are based on computing document similarities. In order to compute a similarity
+we need to possess both the original and the plagiarized document.
+The most straightforward method is to use an online search engine in order to enrich the
+document base with potentially plagiarized documents and to evaluate the amount of plagiarism by detailed document comparison.
+In this paper we introduce a method which was used in the PAN 2012 competition\footnote{\url{http://pan.webis.de/}}
+in plagiarism detection.
+In the first section we describe our approach to retrieving candidate documents for detailed document comparison from online sources.
+
+The next section describes the methods used for computing document similarities.
+We also discuss the performance ...
@@ -38,8 +50,14 @@ Due to the increasing ease of plagirism the plagiarism detection has nowdays bec
 
 \section{Conclusions}
 
-Tady napsat zaver
+We have presented methods for candidate document retrieval which have led to
+the discovery of a decent amount of plagiarism while minimizing the number of queries used.
+
+We have created three main types of queries: keywords based, intrinsic plagiarism based and headers based.
+....
+%We distinguish two properties of queries: positionable, conditionally executable
+....
 
 \bibliographystyle{splncs03}
 \begin{raggedright}
 \bibliography{paper}

diff --git a/simon-searchengine.tex b/simon-searchengine.tex
index f332ba5..e50f3ef 100755
--- a/simon-searchengine.tex
+++ b/simon-searchengine.tex
@@ -1,10 +1,116 @@
-\section{Search-Engine Queries}
+\section{Candidate document retrieval}\label{simon}
+The basic concept of candidate document retrieval is to use a web search
+engine to find suitable documents. In the PAN 12 competition we used the ChatNoir search
+engine~\cite{chatnoir}, which indexes the entire English part of the ClueWeb09 corpus.
+We can thus reduce the problem to constructing appropriate queries for the ChatNoir search engine.
+The goal is to retrieve similar documents which contain the same passages as the source document,
+or similar documents dealing with the same theme.
+The strongest emphasis has been placed on minimizing the number of executed queries,
+since in the real world they may be charged for, time limited, or limited in number.
 
-Plagiarism in the given suspicious documents should be found via the
-ChatNoir search engine~\cite{chatnoir}.
 In order to achieve relevant results we need to construct appropriate queries, which are then sequentially executed.% according to their priority.
+In the PAN 12 test corpus there were 32 text documents, all of which contained at least one plagiarism case.
+The documents were approximately 30 KB in size; the smallest was 18 KB and the largest was 44 KB.
+We have used methods which are applicable in general to any type of text input with no a priori information about the input document.
 
-We can divide constructed queries into three main groups according to their origin:
- i) document keywords based, ii) intrinsic plagiarism based, and iii) headers based.
+All queries from a given document were first preconstructed and afterwards sequentially executed
+according to their priority. After each executed query, the intermediate plagiarism results
+were calculated using the multi-feature document comparison techniques described in Section~\ref{yenya}.
+This process continually lowers the number of executed queries; for more details see the following passages.
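+
+For illustration, the retrieval loop just described can be sketched as follows. This is only a
+simplified model: the callables \texttt{search} and \texttt{evaluate} are placeholders for the
+ChatNoir call and for the detailed comparison of Section~\ref{yenya}, and the rule used here for
+skipping a query that is tied to an already covered position of the suspicious document is an
+assumption made for the sketch rather than the exact condition applied by our system
+(see Section~\ref{process}).
+\begin{verbatim}
+# Simplified sketch of prioritized query execution with
+# intermediate evaluation after every executed query.
+def run_queries(queries, suspicious_doc, search, evaluate):
+    # queries: list of (priority, text, position) tuples; position is
+    # None for whole-document queries (e.g. the keywords based ones)
+    discovered = []    # candidate documents retrieved so far
+    covered = []       # (start, end) regions already matched to a source
+    for priority, text, position in sorted(queries, key=lambda q: q[0]):
+        if position is not None and \
+           any(s <= position < e for s, e in covered):
+            continue                      # region already explained, skip
+        discovered.extend(search(text))   # web search engine call
+        covered = evaluate(suspicious_doc, discovered)
+    return discovered
+\end{verbatim}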
+
+We can divide constructed queries into three main groups according to their origin:
+ i) document keywords based, ii) intrinsic plagiarism based, and iii) headers based.
+%Please note two major attributes of the extracted keywords. Firstly we say
+
+\subsection{Keywords based queries}
+The document keywords based queries are constructed from keywords automatically extracted from the given document.
+For keyword extraction we used a methodology based on word repetition, similar to the method described by Scott and Tribble~\cite{text_patterns}.
+The basic principle is that the more frequent a single word in a given document is, the more likely it is to be a keyword of that document.
+However, the most frequent words usually do not carry any meaning,
+for example articles (the), conjunctions (as) or prepositions (at).
+These types of words are usually referred to as stop words.
+Thus we first filter out stop words according to a general English stop word list.
+Secondly, we used a reference corpus word list as an additional filter.
+We used words in their simple verbatim occurrence allied to a
+statistical estimate of likelihood. As a result, for a word to become a keyword
+it must not exist in the stop list and it must occur in the suspicious document relatively much more frequently
+than in the reference corpus. We used a corpus created from common English web sources, which contains
+more than 4 billion tokens~\cite{ententen}.
+The statistical measure TF-IDF was applied in order to decide whether a processed word should become a keyword.
+TF-IDF combines the term frequency (TF) and the inverse document frequency (IDF) into a composite weight of a given term.
+Taking into account the average length of the suspicious documents and the need to minimize the number of used queries, we set the TF-IDF weight threshold to 0.02,
+which resulted in about 10 to 20 terms identified as keywords per document.
+
+Before executing web search tasks we must combine the extracted keywords into suitable queries.
+All queries were constructed from 5 keywords taken consecutively according to their TF-IDF weight.
+In other words, the 5 top ranked keywords form the first query, the next 5 top ranked keywords form the second query, and so on.
+Using this method we usually obtained 2 to 3 initial queries per document.
+
+\subsubsection{Collocations}
+In addition, we enhanced subsequent queries with the most frequent collocations of the top 4 keywords.
+For those 4 keywords we extracted the most frequent two-word and three-word collocations occurring within the suspicious document.
+The scope for collocates was a single sentence. We counted as collocates the most frequent neighbouring words within this scope, again omitting stop words.
+To take advantage of the extracted collocations we combined them with each other, also into 5 word queries.
+After omitting queries containing duplicated words we added, on average, two new queries to each suspicious document.
+Despite being composed from collocations, we count these queries also as keywords based. Together with
+the queries created only from bare keywords, they were unconditionally executed as the initial queries.
+
+Queries containing around 5 words should be optimal for retrieving the highest mean average precision of results.
+This means we would like to extract from the results of those queries as many documents as possible which still deal
+with a theme similar to that of the source document. Expecting that the order of words within a single query has a small
+impact on the results, no other heuristics for combining keywords into queries were applied, also
+in order to keep the number of queries minimal. For 32 of the 33 input test documents, more than a hundred resulting documents were found using
+those initial queries, which gave a decent document base covering the source document theme for document comparison.
+
+
+
+%using less kw in q would result into more general or underspecifi
+
+All the queries mentioned above characterize the document as a whole.
+They should cover the theme of the document, assuming that the document deals with a single theme.
+
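+To make the construction of the keywords based queries more concrete, the following simplified
+sketch outlines one possible implementation of the extraction and grouping steps described above.
+The tokenization, the particular TF-IDF weighting formula and the input names
+(\texttt{stop\_words}, \texttt{ref\_doc\_freq}, \texttt{ref\_num\_docs}) are illustrative
+assumptions; only the 0.02 weight threshold and the grouping into consecutive groups of five
+keywords follow the description above.
+\begin{verbatim}
+# Illustrative sketch: TF-IDF keyword selection and 5-word queries.
+import math, re
+from collections import Counter
+
+def keyword_queries(text, stop_words, ref_doc_freq, ref_num_docs,
+                    threshold=0.02, per_query=5):
+    words = re.findall(r"[a-z]+", text.lower())
+    counts = Counter(w for w in words if w not in stop_words)
+    total = sum(counts.values())
+    def weight(w):                    # one possible TF-IDF variant
+        idf = math.log(ref_num_docs / (1.0 + ref_doc_freq.get(w, 0)))
+        return (counts[w] / total) * idf
+    ranked = sorted(counts, key=weight, reverse=True)
+    keywords = [w for w in ranked if weight(w) >= threshold]
+    # consecutive groups of five top ranked keywords form the queries
+    return [" ".join(keywords[i:i + per_query])
+            for i in range(0, len(keywords), per_query)]
+\end{verbatim}
+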
+\subsection{Intrinsic plagiarism based queries}
+Intrinsic plagiarism based queries were constructed from a representative sentence residing in
+a selected part of the document. A representative sentence is any sentence from the selected part of the document
+which is longer than eight words.
+Such a sentence should produce the fewest search false positives~\cite{Knight}.
+
+The main task concerning representative sentences is to locate the suitable text part.
+This part should be located by a method which discovers potential plagiarism within the document itself,
+which is called intrinsic plagiarism detection. Such methods are based, for example, on text statistics or syntactic features.
+We chose the statistical vocabulary richness method called average word frequency class, described in~\cite{awfc}.
+Using this method we can compute the average word frequency class value (AWFC) for arbitrary parts of the text. A change
+of this value between two adjacent text windows indicates a change of writing style, which can be caused by plagiarism.
+We computed the AWFC value for every text window of 45 words, which was shifted through the text with a span of 40 words;
+a simplified sketch of this computation is shown below.
+The values of the window size and the shifting span were set empirically during training. Windows with
+the largest change of AWFC compared to the AWFC of their preceding neighbouring window were selected as candidates for query extraction.
+
+
+The query from the representative sentence was composed of the first 6 words of the sentence, again omitting stop words.
+We chose 6 words in order to cover a higher portion of the selected sentence and not to exceed
+the phrasal ChatNoir query limit.
+This type of query is actually looking for documents containing the very same or at least a very similar sentence,
+therefore we chose queries of more than 5 words in order to narrow the search to more specific
+results. Unfortunately, at the time of the PAN 12 competition the ChatNoir search engine did not
+support phrasal search. Since querying a single sentence is essentially a phrasal search, even better results
+might have been expected from those queries had phrasal search been available.
+ %Due to the fact, that ChatNoir does not currently support phrasal search,
+
+The intrinsic plagiarism based queries are document positionable, meaning that we can store
+their position within the document. This is due to the fact that
+those queries are created from more or less consecutive words. As a result, they are also executed conditionally; for more
+information see Section~\ref{process}.
+The position within the document was set to the
+start position of the representative sentence.
+In contrast to the keyword based queries, they characterize only a specific portion of the suspicious document.
+
+\subsection{Headers based queries}
+The last type of queries are those constructed from the local headers of the document.
+A header usually characterizes the following section briefly and accurately, thus it
+can be used as a search query. We considered only headers from 2 to 6 words in length.
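+
+The sliding-window AWFC computation described in the intrinsic plagiarism subsection above can be
+sketched as follows. The word frequency class of a word $w$ is taken here as
+$\lfloor \log_2 (f(w^{*})/f(w)) \rfloor$, where $f(w^{*})$ is the frequency of the most frequent
+word of the reference corpus, following the usual definition of this measure; the function and
+variable names in the sketch are illustrative only.
+\begin{verbatim}
+# Illustrative sketch: AWFC over 45-word windows shifted by 40 words.
+import math
+
+def awfc_candidates(words, corpus_freq, top_freq, window=45, step=40):
+    def awfc(chunk):
+        classes = [math.floor(math.log2(top_freq / corpus_freq.get(w, 1)))
+                   for w in chunk]
+        return sum(classes) / len(classes)
+    scores = [(start, awfc(words[start:start + window]))
+              for start in range(0, max(1, len(words) - window + 1), step)]
+    # windows differing most from their preceding window become candidates
+    deltas = [(abs(curr[1] - prev[1]), curr[0])
+              for prev, curr in zip(scores, scores[1:])]
+    return [start for _, start in sorted(deltas, reverse=True)]
+\end{verbatim}
+A representative sentence is then taken from such a selected window, and its first six non-stop
+words form the query, as described above.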
+
+
+\subsection{Input document processing}\label{process}
+
+\subsection{Queries comparison}
+
diff --git a/yenya-detailed.tex b/yenya-detailed.tex
old mode 100644
new mode 100755
index f2dd93f..9f40346
--- a/yenya-detailed.tex
+++ b/yenya-detailed.tex
@@ -1,4 +1,4 @@
-\section{Detailed Document Comparison}
+\section{Detailed Document Comparison}\label{yenya}
 \label{detailed}
 
-- 
2.43.0