Merge http://www.fi.muni.cz/~kas/git/pan13-paper

author Simon Suchomel <xsuchom1@anxur.fi.muni.cz>

Thu, 30 May 2013 15:55:04 +0000 (17:55 +0200)

committer Simon Suchomel <xsuchom1@anxur.fi.muni.cz>

Thu, 30 May 2013 15:55:04 +0000 (17:55 +0200)
author Simon Suchomel <xsuchom1@anxur.fi.muni.cz>
Thu, 30 May 2013 15:55:04 +0000 (17:55 +0200)
committer Simon Suchomel <xsuchom1@anxur.fi.muni.cz>
Thu, 30 May 2013 15:55:04 +0000 (17:55 +0200)
diff --git a/pan13-paper/img/snippets_graph.pdf b/pan13-paper/img/snippets_graph.pdf

new file mode 100755 (executable)

index 0000000..7441e98

Binary files /dev/null and b/pan13-paper/img/snippets_graph.pdf differ
diff --git a/pan13-paper/img/source_retrieval_process.pdf b/pan13-paper/img/source_retrieval_process.pdf

index bc4c6b9eaeda6bb5d0afa2ee0fa4521d4e6b638d..d275b61a8c9c7bb322472018342778672c10bcd1 100755 (executable)

Binary files a/pan13-paper/img/source_retrieval_process.pdf and b/pan13-paper/img/source_retrieval_process.pdf differ
diff --git a/pan13-paper/pan13-notebook.bib b/pan13-paper/pan13-notebook.bib

index e325b6f2034a30275643ee8c6e945ab4235d5781..59c4aa082237448f51861c2319e8ccdf33a5cbcb 100755 (executable)
--- a/pan13-paper/pan13-notebook.bib
+++ b/pan13-paper/pan13-notebook.bib
@@ -1,17 +1,91 @@
  @INPROCEEDINGS{chatnoir,\r
          AUTHOR             = {Martin Potthast and Matthias Hagen and Benno Stein and Jan Gra{\ss}egger and Maximilian Michel and Martin Tippmann and Clement Welsch},\r
          BOOKTITLE          = {35th International ACM Conference on Research and Development in Information Retrieval (SIGIR 12)},\r
-        DOI                = {},\r
+        DOI                = {10.1145/2348283.2348429},\r
          EDITOR             = {Bill Hersh and Jamie Callan and Yoelle Maarek and Mark Sanderson},\r
-        ISBN               = {},\r
+        ISBN               = {978-1-4503-1472-5},\r
          MONTH              = aug,\r
-        PAGES              = {},\r
-        PUBLISHER          = {},\r
+        PAGES              = {1004},\r
+        PUBLISHER          = {ACM},\r
          SITE               = {Portland, Oregon},\r
          TITLE              = {{ChatNoir: A Search Engine for the ClueWeb09 Corpus}},\r
          YEAR               = {2012}\r
  }\r
  \r
+\r
+@inproceedings{suchomel_kas_12,\r
+  added-at = {2012-10-01T11:37:58.000+0200},\r
+  author = {Suchomel, {\v S}imon and Kasprzak, Jan and Brandejs, Michal},\r
+  bibsource = {DBLP, http://dblp.uni-trier.de},\r
+  biburl = {http://www.bibsonomy.org/bibtex/261d1f12dbeffef7de955e8cfa7cec167/promisenoe},\r
+  booktitle = {CLEF (Online Working Notes/Labs/Workshop)},\r
+  editor = {Forner, Pamela and Karlgren, Jussi and Womser-Hacker, Christa},\r
+  ee = {http://www.clef-initiative.eu/documents/71612/1f71592e-ad8a-4c84-833e-46a82b44a9be},\r
+  interhash = {07a49b465de0f0a1993d99d6afb51275},\r
+  intrahash = {61d1f12dbeffef7de955e8cfa7cec167},\r
+  isbn = {978-88-904810-3-1},\r
+  keywords = {dblp},\r
+  timestamp = {2012-10-01T11:37:58.000+0200},\r
+  title = {Three Way Search Engine Queries with Multi-feature Document Comparison for Plagiarism Detection},\r
+  year = 2012\r
+}\r
+\r
+@book{Introduction_to_information_retrieval,\r
+  abstract = {Class-tested and coherent, this textbook teaches classical and web information retrieval, including web search and the related areas of text classification and text clustering from basic concepts. It gives an up-to-date treatment of all aspects of the design and implementation of systems for gathering, indexing, and searching documents; methods for evaluating systems; and an introduction to the use of machine learning methods on text collections. All the important ideas are explained using examples and figures, making it perfect for introductory courses in information retrieval for advanced undergraduates and graduate students in computer science. Based on feedback from extensive classroom experience, the book has been carefully structured in order to make teaching more natural and effective.},\r
+  added-at = {2012-05-30T10:50:27.000+0200},\r
+  address = {Cambridge, UK},\r
+  author = {Manning, Christopher D. and Raghavan, Prabhakar and Sch{\"u}tze, Hinrich},\r
+  biburl = {\url{http://www.bibsonomy.org/bibtex/28516d94c1f7aa1e391ddd3ace4caa23b/flint63}},\r
+  file = {Cambridge University Press Product Page:http\://www.cambridge.org/9780521865715:URL;Amazon Search inside:http\://www.amazon.de/gp/reader/0521865719/:URL;Google Books:http\://books.google.de/books?isbn=978-0-521-86571-5:URL},\r
+  PAGES              = {118--120},\r
+  groups = {public},\r
+  interhash = {b6954037b1d444f4afe4cad883b4d80c},\r
+  intrahash = {8516d94c1f7aa1e391ddd3ace4caa23b},\r
+  isbn = {978-0-521-86571-5},\r
+  keywords = {v1205 book ai information retrieval language processing search xml web},\r
+  publisher = {Cambridge University Press},\r
+  timestamp = {2012-05-30T10:50:27.000+0200},\r
+  title = {Introduction to Information Retrieval},\r
+  username = {flint63},\r
+  year = 2008\r
+}\r
+\r
+@MISC{ententen,\r
+ key = "{Corpus}",\r
+ title = "{Sketch Engine EnTenTen Corpus}",\r
+ howpublished = "\url{http://trac.sketchengine.co.uk/wiki/Corpora/enTenTen}",\r
+ year = "2012", \r
+}\r
+\r
+@inproceedings{spiderLink,\r
+  author = {Suchomel, V. and Pomik{\'a}lek, J.},\r
+  booktitle = {Proceedings of the seventh Web as Corpus Workshop (WAC7)},\r
+  pages  = {39--43},\r
+  editor = {Adam Kilgarriff and Serge Sharoff},\r
+  title = {Efficient Web Crawling for Large Text Corpora},\r
+  year = 2012\r
+}\r
+\r
+@INPROCEEDINGS{awfc,\r
+    author = {Sven Meyer Zu Eissen and Benno Stein},\r
+    title = {Intrinsic Plagiarism Detection},\r
+    booktitle = {Proceedings of the European Conference on Information Retrieval (ECIR-06)},\r
+    year = {2006}\r
+}\r
+\r
+@INPROCEEDINGS{plagCorpus,\r
+        AUTHOR             = {Martin Potthast and Matthias Hagen and Michael V{\"o}lske and Benno Stein},\r
+        BOOKTITLE          = {51st Annual Meeting of the Association for Computational Linguistics (ACL 13) -- (to appear)},\r
+        DOI                = {http://dx.doi.org/},\r
+        EDITOR             = {},\r
+        MONTH              = aug,\r
+        PAGES              = {},\r
+        PUBLISHER          = {Association for Computational Linguistics},\r
+        SITE               = {Sofia, Bulgaria},\r
+        TITLE              = {Crowdsourcing Interaction Logs to Understand Text Reuse from the Web},\r
+        URL                = {},\r
+        YEAR               = {2013}\r
+}\r
  @INPROCEEDINGS{Kasprzak2009a,\r
    AUTHOR =       "Jan Kasprzak and Michal Brandejs and Miroslav K\v{r}ipa\v{c}",\r
    TITLE =        "Finding Plagiarism by Evaluating Document Similarities",\r
@@ -50,4 +124,5 @@
    booktitle={CLEF (Online Working Notes/Labs/Workshop)},\r
    pages={1--8},\r
    year={2012}\r
+\r
  }\r
diff --git a/pan13-paper/pan13-notebook.pdf b/pan13-paper/pan13-notebook.pdf

deleted file mode 100644 (file)

index cbde3e1..0000000

Binary files a/pan13-paper/pan13-notebook.pdf and /dev/null differ
diff --git a/pan13-paper/simon-source_retrieval.aux b/pan13-paper/simon-source_retrieval.aux

deleted file mode 100644 (file)

index 1648a02..0000000
--- a/pan13-paper/simon-source_retrieval.aux
+++ /dev/null
@@ -1,49 +0,0 @@
-\relax 
-\citation{chatnoir}
-\@writefile{toc}{\contentsline {section}{\numberline {2}Source Retrieval}{2}}
-\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Source retrieval process.}}{2}}
-\newlabel{fig:source_retr_process}{{1}{2}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Querying}{3}}
-\@writefile{toc}{\contentsline {subsubsection}{Keywords Based Queries}{3}}
-\@writefile{toc}{\contentsline {subsubsection}{Intrinsic Plagiarism Based Queries}{3}}
-\@writefile{toc}{\contentsline {subsubsection}{Paragraph Based Queries}{3}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Search Control}{3}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Result Selection}{3}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Snippet Control}{3}}
-\@setckpt{simon-source_retrieval}{
-\setcounter{page}{4}
-\setcounter{equation}{0}
-\setcounter{enumi}{0}
-\setcounter{enumii}{0}
-\setcounter{enumiii}{0}
-\setcounter{enumiv}{0}
-\setcounter{footnote}{2}
-\setcounter{mpfootnote}{0}
-\setcounter{part}{0}
-\setcounter{section}{2}
-\setcounter{subsection}{4}
-\setcounter{subsubsection}{0}
-\setcounter{paragraph}{0}
-\setcounter{subparagraph}{0}
-\setcounter{figure}{1}
-\setcounter{table}{0}
-\setcounter{chapter}{1}
-\setcounter{@inst}{1}
-\setcounter{@auth}{3}
-\setcounter{auco}{3}
-\setcounter{theorem}{0}
-\setcounter{case}{0}
-\setcounter{conjecture}{0}
-\setcounter{corollary}{0}
-\setcounter{definition}{0}
-\setcounter{example}{0}
-\setcounter{exercise}{0}
-\setcounter{lemma}{0}
-\setcounter{note}{0}
-\setcounter{problem}{0}
-\setcounter{property}{0}
-\setcounter{proposition}{0}
-\setcounter{question}{0}
-\setcounter{solution}{0}
-\setcounter{remark}{0}
-}
diff --git a/pan13-paper/simon-source_retrieval.tex b/pan13-paper/simon-source_retrieval.tex

index b3289c9fe6beb2c3e0f1e7e818d68212a2849462..d5b338b948a6cc4a13fac8319d8bb51328861b7e 100755 (executable)
--- a/pan13-paper/simon-source_retrieval.tex
+++ b/pan13-paper/simon-source_retrieval.tex
@@ -5,9 +5,9 @@ large corpus. Those candidate documents are usually further compared in detail w
  suspicious document. In the PAN 2013 source retrieval subtask the main goal was to\r
  identified web pages which have been used as a source of plagiarism for creation of the \r
  test corpus. \r
-The test corpus contained XX documents each discussing one and only one theme.\r
+The test corpus contained 58 documents each discussing one and only one theme.\r
  Those documents were created intentionally by\r
- semiprofessional writers, thus they feature nearly realistic plagiarism cases. \r
+ semiprofessional writers, thus they feature nearly realistic plagiarism cases~\cite{plagCorpus}. \r
   Such conditions are similar to a realistic plagiarism detection scenario, such as for\r
  state of the art commercial plagiarism detection systems or the anti-plagiarism service developed on and\r
  utilized at the Masaryk University. The main difference between real-world corpus \r
@@ -33,7 +33,8 @@ In real-world scenario the corpus is the whole Web and the search engine can be
  which scales to the size of the Web. This methodology is based on the fact that we do not\r
  possess enough resources to download and effectively process the whole corpus.\r
  In the case of PAN 2013 competition the corpus\r
-of source documents is the ClueWeb~\footnote{\url{http://lemurproject.org/clueweb09.php/}} corpus. \r
+of source documents is the ClueWeb\footnote{\url{http://lemurproject.org/clueweb09.php/}} corpus.\r
+\r
  As a document retrieval tool for the competition we utilized the ChatNoir~\cite{chatnoir} search engine which indexes the English\r
  subset of the ClueWeb.   \r
  The reverse engineering decision process reside in creation of suitable queries on the basis of the suspicious document\r
@@ -44,15 +45,16 @@ from the search engine are forthwith textually aligned with the suspicious docum
  This is the last decision phase -- what to report.\r
  If there is any continuous passage of reused text detected, the result document is reported\r
   and the continuous passages in the suspicious document are marked as 'discovered' and no further processing\r
-of those parts is made. \r
+of those parts is done. \r
   \r
  \subsection{Querying}\r
  Querying means to effectively utilize the search engine in order to retrieve as many relevant\r
  documents as possible with the minimum amount of queries. We consider the resulting document relevant \r
-if it shares some of text characteristics with the suspicious document.  \r
+if it shares some of the text characteristics with the suspicious document. In a real-world scenario, queries as such\r
+represent an appreciable cost; therefore, their minimization should be one of the top priorities.\r
  \r
-We used 3 different types of queries~\footnote{We used similar three-way based methodology in PAN 2012 \r
-Candidate Document Retrieval subtask. However this time we completely replaced the headers based queries\r
+We used 3 different types of queries\footnote{We used similar three-way based methodology in PAN 2012 \r
+Candidate Document Retrieval subtask. However, this time we completely replaced the headers based queries\r
  with paragraph based queries, since the headers based queries did not pay off in the overall process.}:\r
  i) keywords based queries, ii) intrinsic plagiarism\r
  based queries, and iii) paragraph based queries. Three main properties distinguish each type of query: i) Positional; ii) Phrasal; iii) Deterministic.\r
@@ -61,18 +63,112 @@ A phrasal query aims for retrieval of documents containing the same small piece
  Deterministic queries for specific suspicious document are always the same no matter how many times we run the software. \r
  On the contrary the software can create in two runs potentially different nondeterministic queries.\r
  \r
-\subsubsection{Keywords Based Queries}\r
+\subsubsection{Keywords Based Queries.}\r
+The keywords based queries are composed of keywords automatically extracted from the whole suspicious document.\r
+Their purpose is to retrieve documents concerning the same theme. Two documents discussing the \r
+same theme usually share a set of overlapping keywords. Also, the combination of keywords in\r
+a query matters. \r
+As a method for automated keywords extraction, we used a frequency based approach described in~\cite{suchomel_kas_12}.\r
+The method combines term frequency analysis with TF-IDF score~\cite{Introduction_to_information_retrieval}. As a reference\r
+corpus we used an English web corpus~\cite{ententen} crawled by SpiderLink~\cite{spiderLink} in 2012, which contains 4.65 billion tokens. \r
+\r
+Each keywords based query was constructed from five top ranked keywords consecutively. Each keyword was\r
+used only in one query. Too long keywords based queries would be over-specific and would have resulted\r
+in a low recall. On the other hand, constructing too short (one or two tokens) queries would have resulted\r
+in a low precision and also possibly low recall, since they would be too general.\r
+\r
+In order to direct the search more at the highest ranked keywords, we also extracted their \r
+most frequent two and three term long collocations. These were also combined into queries of 5 words.\r
+As a result, the 4 top ranked keywords can each appear in two different queries, one from the keywords\r
+alone and one from the collocations. A collocation describes its keyword better than the keyword alone. \r
+\r
+The keywords based queries are non-positional, since they represent the whole document. They are also non-phrasal, since\r
+they are constructed of tokens gathered from different parts of the text. And they are deterministic: for a given input\r
+document the extractor always returns the same keywords.\r
+\r
+\subsubsection{Intrinsic Plagiarism Based Queries.}\r
+The purpose of the second type of queries is to retrieve pages which contain text similar to the parts detected\r
+as different, in terms of writing style, from other parts of the suspicious document.\r
+Such a change may point out a plagiarized passage which is intrinsically bound up with the text.  \r
+We implemented a vocabulary richness method which computes the average word frequency class value for \r
+a given text part. The method is described in~\cite{awfc}. The problem is that, generally, methods\r
+based on vocabulary statistics work better for longer texts. According to the authors, this method\r
+scales better to shorter texts than other text style detection methods. \r
+Still, its usage is in our case limited by relatively short texts. It is also difficult to determine\r
+what parts of the text to compare. Therefore, we used a sliding window concept for text chunking with the \r
+same settings as described in~\cite{suchomel_kas_12}.\r
+\r
+A representative sentence longer than 6 words was randomly selected among those that apply from the suspicious part of the document.\r
+An intrinsic plagiarism based query is created from the representative sentence leaving out stop words.\r
+\r
+The intrinsic plagiarism based queries are positional. They carry the position of the representative sentence in the document.\r
+They are phrasal, since they represent a search for a specific sentence. And they are\r
+nondeterministic, because the representative sentence is selected randomly. \r
+ \r
+\subsubsection{Paragraph Based Queries.}\r
+The purpose of paragraph based queries is to check some parts of the text in more depth,\r
+namely the parts for which no similarity has been found during previous searches. \r
+\r
+For this case we considered a paragraph as a minimum text chunk for plagiarism to occur. \r
+It is debatable whether a plagiarist would be prosecuted for plagiarizing only one sentence in a paragraph.\r
+Also, detection of a specific sentence is very difficult if we want to avoid an exhaustive search approach.\r
+If someone is to reuse some piece of continuous text, it would probably be no shorter than a paragraph. \r
+Despite the fact that paragraphs differ in length, we represent one paragraph by one query.\r
+\r
+\r
+A paragraph based query was created from each paragraph of a suspicious document.\r
+From each paragraph we extracted the longest sentence, from which the query was constructed.\r
+Ideally, the extracted sentence should carry the highest information gain.\r
+The query was at most 10 words long, which is the upper bound of ChatNoir,\r
+and was constructed from the selected sentence by omitting stop words.\r
  \r
-\subsubsection{Intrinsic Plagiarism Based Queries}\r
-\subsubsection{Paragraph Based Queries}\r
  \subsection{Search Control}\r
+For each suspicious document we prepared all three types of queries during the first phase at once.\r
+Queries were executed stepwise. \r
+After processing each query the results were evaluated (see the following subsection~\ref{resSelection} for more details) and from\r
+all textual similarities between each result and the suspicious document, the suspicious document intervals of those similarities\r
+were marked as 'discovered'. \r
+The keywords based queries were executed first. All of the keywords based queries were\r
+always executed. \r
+After all the keywords based queries had been processed, the intrinsic plagiarism based queries were executed according to \r
+their creation sequence. \r
+Since they carry their position, not all of the intrinsic plagiarism based queries were carried out.\r
+During the execution, if the position of a query intersected with any of the 'discovered' intervals, the\r
+query was dropped. Finally, the paragraph based queries were processed in the same way. \r
+\r
+This search control results in two major properties. Firstly, the source retrieval effort was increased \r
+in parts of the suspicious document where no textual similarity had yet been found,\r
+especially by the paragraph based queries. And secondly, after detecting a similarity for a certain part of the text,\r
+no more intentional retrieval attempts for that part were made. This means that all\r
+discovered search engine results were evaluated, but no more queries regarding that passage were executed.\r
  \r
  \r
  \subsection{Result Selection}\r
-\subsection{Snippet Control}\r
+The second main decision area of the source retrieval task is deciding which of the search engine results to download.\r
+This process is represented in figure~\ref{fig:source_retr_process} as 'Selecting'. \r
+Nowadays, in the real world, downloading is a very cheap and quick operation. There can be some disk space considerations\r
+if there is a need to store the original downloaded documents. The main cost lies in post-processing the documents. \r
+On the Internet in particular, there is a wide range of file formats, which must be\r
+converted into plain text for text alignment. This can be time-consuming and computationally expensive. For example, from many\r
+PDF documents the plain text is hardly extractable, thus one needs to use optical character recognition methods.\r
  \r
+ChatNoir offers snippets for discovered documents. The snippet generation is considered a costless\r
+operation. The purpose of a snippet is to provide a quick glance at a small extract of the resulting page.\r
+The extract is at most 500 characters long and it is a portion of the document around the given keywords.\r
+On the basis of the snippet, we needed to decide whether to actually download the result or not.\r
+\r
+Since the snippet is relatively small and it can be a discontinuous part of the text, the \r
+text alignment methods described in section~\ref{text_alignment} were insufficient for \r
  \r
- \r
  \r
  \r
+\subsection{Snippet Control}\r
+\begin{figure}\r
+  \centering\r
+  \includegraphics[width=1.00\textwidth]{img/snippets_graph.pdf}\r
+  \caption{Downloads and similarities performance.}\r
+  \label{fig:snippet_graph}\r
+\end{figure}\r
+\subsection{Source Retrieval Results}\r
+\r
  \r
diff --git a/pan13-paper/yenya-dtext_alignment.aux b/pan13-paper/yenya-dtext_alignment.aux

deleted file mode 100644 (file)

index ed3393a..0000000
--- a/pan13-paper/yenya-dtext_alignment.aux
+++ /dev/null
@@ -1,38 +0,0 @@
-\relax 
-\@setckpt{yenya-dtext_alignment}{
-\setcounter{page}{3}
-\setcounter{equation}{0}
-\setcounter{enumi}{0}
-\setcounter{enumii}{0}
-\setcounter{enumiii}{0}
-\setcounter{enumiv}{0}
-\setcounter{footnote}{0}
-\setcounter{mpfootnote}{0}
-\setcounter{part}{0}
-\setcounter{section}{2}
-\setcounter{subsection}{0}
-\setcounter{subsubsection}{0}
-\setcounter{paragraph}{0}
-\setcounter{subparagraph}{0}
-\setcounter{figure}{0}
-\setcounter{table}{0}
-\setcounter{chapter}{1}
-\setcounter{@inst}{1}
-\setcounter{@auth}{3}
-\setcounter{auco}{3}
-\setcounter{theorem}{0}
-\setcounter{case}{0}
-\setcounter{conjecture}{0}
-\setcounter{corollary}{0}
-\setcounter{definition}{0}
-\setcounter{example}{0}
-\setcounter{exercise}{0}
-\setcounter{lemma}{0}
-\setcounter{note}{0}
-\setcounter{problem}{0}
-\setcounter{property}{0}
-\setcounter{proposition}{0}
-\setcounter{question}{0}
-\setcounter{solution}{0}
-\setcounter{remark}{0}
-}
author	Simon Suchomel <xsuchom1@anxur.fi.muni.cz>
	Thu, 30 May 2013 15:55:04 +0000 (17:55 +0200)
committer	Simon Suchomel <xsuchom1@anxur.fi.muni.cz>
	Thu, 30 May 2013 15:55:04 +0000 (17:55 +0200)
pan13-paper/img/snippets_graph.pdf	[new file with mode: 0755]	patch \| blob
pan13-paper/img/source_retrieval_process.pdf		patch \| blob \| history
pan13-paper/pan13-notebook.bib		patch \| blob \| history
pan13-paper/pan13-notebook.pdf	[deleted file]	patch \| blob \| history
pan13-paper/simon-source_retrieval.aux	[deleted file]	patch \| blob \| history
pan13-paper/simon-source_retrieval.tex		patch \| blob \| history
pan13-paper/yenya-dtext_alignment.aux	[deleted file]	patch \| blob \| history