From ab24ba815bdf84d55df641c0d03d4dc93ad0453e Mon Sep 17 00:00:00 2001
From: Simon Suchomel
Date: Thu, 30 May 2013 23:00:51 +0200
Subject: [PATCH] Thursday edits

---
 pan13-paper/pan13-notebook.bib         |  5 +--
 pan13-paper/simon-source_retrieval.tex | 51 ++++++++++++++++++++++----
 2 files changed, 45 insertions(+), 11 deletions(-)

diff --git a/pan13-paper/pan13-notebook.bib b/pan13-paper/pan13-notebook.bib
index 59c4aa0..204d2fc 100755
--- a/pan13-paper/pan13-notebook.bib
+++ b/pan13-paper/pan13-notebook.bib
@@ -12,7 +12,6 @@
 YEAR = {2012}
 }
 
-<<<<<<< HEAD
 @inproceedings{suchomel_kas_12,
   added-at = {2012-10-01T11:37:58.000+0200},
   author = {Suchomel, {\v S}imon and Kasprzak, Jan and Brandejs, Michal},
@@ -85,7 +84,8 @@
 TITLE = {Crowdsourcing Interaction Logs to Understand Text Reuse from the Web},
 URL = {},
 YEAR = {2013}
-=======
+}
+
 @INPROCEEDINGS{Kasprzak2009a,
 AUTHOR = "Jan Kasprzak and Michal Brandejs and Miroslav K\v{r}ipa\v{c}",
 TITLE = "Finding Plagiarism by Evaluating Document Similarities",
@@ -124,5 +124,4 @@
 booktitle={CLEF (Online Working Notes/Labs/Workshop)},
 pages={1--8},
 year={2012}
->>>>>>> 2278ad058d0a6e0c2228741c76aece9ace432912
 }
diff --git a/pan13-paper/simon-source_retrieval.tex b/pan13-paper/simon-source_retrieval.tex
index d5b338b..29e9781 100755
--- a/pan13-paper/simon-source_retrieval.tex
+++ b/pan13-paper/simon-source_retrieval.tex
@@ -73,7 +73,7 @@ The method combines term frequency analysis with TF-IDF score~\cite{Introduction
 corpus we used the English web corpus~\cite{ententen}, crawled by SpiderLink~\cite{SpiderLink}
 in 2012, which contains 4.65 billion tokens. Each keyword-based query was constructed from the
 five top-ranked keywords consecutively. Each keyword was
-used only in one query. Too long keywords based queries would be over-specific and it would have resulted
+used in only one query. Queries built from too many keywords would be overly specific and would have resulted
 in a low recall. On the other hand, queries that are too short (one or two tokens) would have
 resulted in a low precision, and possibly also a low recall, since they would be too general.
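The query formulation in this hunk amounts to chunking the ranked keyword list into disjoint groups of five. A minimal sketch in Python, assuming the keywords are already ranked best-first by the TF-IDF-based score; the names are illustrative, not the authors' published code:

    # Sketch of keyword-based query formulation: consecutive groups of
    # five top-ranked keywords, each keyword used in exactly one query.
    def build_keyword_queries(ranked_keywords, per_query=5):
        queries = []
        for i in range(0, len(ranked_keywords), per_query):
            queries.append(" ".join(ranked_keywords[i:i + per_query]))
        return queries

    # Example: ten ranked keywords yield two five-keyword queries.
    print(build_keyword_queries(
        ["plagiarism", "detection", "source", "retrieval", "corpus",
         "snippet", "alignment", "query", "recall", "precision"]))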
@@ -143,7 +143,7 @@ no more intentionally retrieval attempts for that part were effectuated. Meaning
 discovered search engine results were evaluated, but no further queries regarding
 that passage were executed.
 
-\subsection{Result Selection}
+\subsection{Result Selection}\label{resSelection}
 The second main decision in the source retrieval task is which of the search engine results
 to download. This process is represented in figure~\ref{fig:source_retr_process} as 'Selecting'.
 Nowadays, in the real world, downloading is a very cheap and quick operation. There can be some disk space considerations
@@ -157,11 +157,6 @@ operation. The snippet purpose is to have a quick glance at a small extract of r
 The extract is at most 500 characters long and it is a portion of the document around the
 given keywords. On the basis of the snippet, we needed to decide whether to actually
 download the result or not.
-Since the snippet is relatively small and it can be discontinuous part of the text, the
-text alignment methods described in section~\ref{text_alignment} were insufficient for
-
-
-
 \subsection{Snippet Control}
 \begin{figure}
 \centering
@@ -169,6 +164,46 @@ text alignment methods described in section~\ref{text_alignment} were insufficie
 \caption{Downloads and similarities performance.}
 \label{fig:snippet_graph}
 \end{figure}
-\subsection{Source Retrieval Results}
+Since the snippet is relatively small and can be a discontinuous part of the text, the
+text alignment methods described in section~\ref{text_alignment} were insufficient
+for deciding whether to download a document. Therefore we chose to check the existence
+of snippet word tuples in the suspicious document. For 1-tuples the measure expresses how
+many words from the snippet also occur in the suspicious document. If the snippet contains
+many common words, they may occur in many documents, so the 1-tuple measure has little
+decisive value.
+
+We therefore used a 2-tuple measure, which indicates how many neighbouring word pairs
+coexist in the snippet and in the suspicious document. According to this value we decided
+whether to download the source or not. To derive the threshold value we used 4413 search
+results from various queries over the documents in the training corpus. Each resulting
+document was textually aligned to its corresponding suspicious document. One similarity
+represents a continuous passage of text alignment similarity, as described in the following
+section~\ref{text_alignment}. In this way we obtained 248 similarities in total after
+downloading all of the 4413 documents.
+
+The performance of the 2-tuple similarity is depicted in figure~\ref{fig:snippet_graph}.
+The horizontal axis represents the threshold of the 2-tuple similarity percentage between
+the snippet and the suspicious document. The curves show the percentage of downloads
+executed and the percentage of similarities discovered as functions of this threshold.
+A profitable threshold is one with the largest distance between the two curves.
+We set the snippet similarity threshold to 20\%, which in the graph corresponds to 20\% of
+all downloads while still discovering 70\% of the similarities.
+
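The 2-tuple check and the 20% download threshold can be made concrete with a minimal Python sketch; whitespace tokenization and all names here are assumptions, not the authors' code:

    # Sketch of the snippet 2-tuple (neighbouring word pair) similarity
    # and the resulting download decision.
    def bigrams(tokens):
        # Set of neighbouring word pairs in a token sequence.
        return set(zip(tokens, tokens[1:]))

    def snippet_similarity(snippet, suspicious_text):
        # Fraction of the snippet's word pairs that also occur in the
        # suspicious document (0.0 if the snippet has fewer than two words).
        snippet_pairs = bigrams(snippet.lower().split())
        if not snippet_pairs:
            return 0.0
        doc_pairs = bigrams(suspicious_text.lower().split())
        return len(snippet_pairs & doc_pairs) / len(snippet_pairs)

    def should_download(snippet, suspicious_text, threshold=0.20):
        # Download when at least 20% of the snippet's word pairs reappear
        # in the suspicious document (the threshold chosen from the graph).
        return snippet_similarity(snippet, suspicious_text) >= threshold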
+\subsection{Source Retrieval Results}
+In the PAN 2013 Source Retrieval subtask we competed with 8 other teams.
+No single best approach can be selected, because there were several independent
+performance measures. Each approach has its pros and cons, and different approaches
+are usable in different situations.
+
+We believe that in realistic plagiarism detection the most important objectives are
+keeping the number of queries low and simultaneously maximizing recall.
+% There is often a tradeoff between cost and effectiveness.
+It is also advisable to keep the number of downloads down; on the other hand, downloading
+is a relatively cheap and easily scalable operation.
+
+Our approach had the second best ratio of recall to the number of queries used, which
+indicates its query efficiency. The approach with the best ratio used few queries
+(4.9 queries per document, which was 0.4 of the amount we used), but also obtained the
+lowest recall (0.65 of our recall). The approach with the highest recall (and also the
+lowest precision) achieved 2.8 times higher recall than ours with 3.9 times more queries.
+
+Our approach also achieved a low precision, which means many of the results we reported
+were not considered correct hits. On the other hand, each reported result contained some
+textual similarity according to the text alignment subtask score, which we believe is
+still worthwhile to report.
-- 
2.43.0