]> www.fi.muni.cz Git - pan13-paper.git/commitdiff
1. verze hotove Simonovy casti
authorSimon Suchomel <xsuchom1@anxur.fi.muni.cz>
Thu, 19 Sep 2013 13:19:02 +0000 (15:19 +0200)
committerSimon Suchomel <xsuchom1@anxur.fi.muni.cz>
Thu, 19 Sep 2013 13:19:02 +0000 (15:19 +0200)
pan13-poster/img/document_awfc.pdf
pan13-poster/img/document_keywords.pdf [new file with mode: 0755]
pan13-poster/img/document_paragraphs.pdf [new file with mode: 0755]
pan13-poster/img/queryprocess.pdf [new file with mode: 0755]
pan13-poster/poster.tex

index 71e5eff24ad5ead635929ff6c835515cff6fed80..0a48308dcf5b357cfa80d9e315ae3fd9f64ccc11 100755 (executable)
Binary files a/pan13-poster/img/document_awfc.pdf and b/pan13-poster/img/document_awfc.pdf differ
diff --git a/pan13-poster/img/document_keywords.pdf b/pan13-poster/img/document_keywords.pdf
new file mode 100755 (executable)
index 0000000..f60baf6
Binary files /dev/null and b/pan13-poster/img/document_keywords.pdf differ
diff --git a/pan13-poster/img/document_paragraphs.pdf b/pan13-poster/img/document_paragraphs.pdf
new file mode 100755 (executable)
index 0000000..38c4372
Binary files /dev/null and b/pan13-poster/img/document_paragraphs.pdf differ
diff --git a/pan13-poster/img/queryprocess.pdf b/pan13-poster/img/queryprocess.pdf
new file mode 100755 (executable)
index 0000000..e6d8a1a
Binary files /dev/null and b/pan13-poster/img/queryprocess.pdf differ
index 42987ac9382911afc72bf535f6f37347ffc51ab8..5e3c9a095b4e30283e5c9b63b798d7325becc268 100755 (executable)
 \r
 \r
 \begin{multicols}{2}\setlength{\columnseprule}{0pt}\r
-\r
-\r
 \section{Introduction}\r
-\r
+%\r
 PAN 2013 LOrem ipsum Lorem ipsum Lorem ipsumLorem ipsumLorem ipsumLorem ipsumLorem ipsum \r
-\r
-\r
+%\r
 \vfill\r
 \columnbreak\r
-\r
+%\r
 \begin{figure}\r
  \centering\r
-  \includegraphics[width=0.8\textwidth]{img/source_retrieval_process.pdf}\r
+  \includegraphics[width=0.6\textwidth]{img/source_retrieval_process.pdf}\r
   \caption{Plagiarism discovery process.}\r
   \label{fig:process}\r
 \end{figure} \r
-\r
-\r
 \end{multicols}\r
-\r
-\r
-\r
 \begin{multicols}{2}\r
-\r
 %\rm\r
-\r
 %%% Introduction\r
 \section{Querying}\r
 Querying means to effectively utilize the search engine in order to retrieve as many relevant\r
 documents as possible with the minimum amount of queries.\r
 %We consider the resulting document relevantif it shares some of text characteristics with the suspicious document.\r
-In real-world queries as such represent appreciable cost, therefore their minimization should be one of the top priorities. \\\r
-\subsection{Types of Queries}\r
-From the suspicious document, there were three diverse types of queries extracted.\r
-\subsubsection{Keywords Based Queries}\r
+In the real world, queries as such represent an appreciable cost; therefore, their minimization should be one of the top priorities. \r
+%\subsection{Types of Queries}\r
+Three diverse types of queries were extracted from the suspicious document.\\\r
+\begin{minipage}{0.55\linewidth}\r
+\subsection{Keywords Based Queries}\r
 \begin{ytemize}\r
 \item TF--IDF base automated keywords extraction;\r
 \item 5-token long; \r
@@ -158,9 +149,15 @@ From the suspicious document, there were three diverse types of queries extracte
 \item Non-positional;\r
 \item Non-phrasal.\r
 \end{ytemize}\r
-\r
+\end{minipage}\r
+\begin{minipage}{0.45\linewidth}\r
+\begin{figure}[h]\r
+ %\centering\r
+  \includegraphics[width=1\linewidth]{img/document_keywords.pdf}\r
+\end{figure}\r
+\end{minipage}\r
 \begin{minipage}{0.55\linewidth}\r
-\subsubsection{Intrinsic Plagiarism Based Queries}\r
+\subsection{Intrinsic Plagiarism Based Queries}\r
 \begin{ytemize}\r
 \item Averaged Word Frequency Class based chunking~\cite{AWFC};\r
 \item Random sentence selection from the chunk;\r
@@ -175,16 +172,35 @@ From the suspicious document, there were three diverse types of queries extracte
   \includegraphics[width=1\linewidth]{img/document_awfc.pdf}\r
 \end{figure}\r
 \end{minipage}\r
-\r
-\subsubsection{Paragraph Based Queries}\r
+\begin{minipage}{0.55\linewidth}\r
+\subsection{Paragraph Based Queries}\r
 \begin{ytemize}\r
 \item Longest sentences from miscellaneous paragraphs;\r
 \item Deterministic;\r
 \item Positional;\r
 \item Phrasal.\r
 \end{ytemize}\r
+\end{minipage}\r
+\begin{minipage}{0.45\linewidth}\r
+\begin{figure}[h]\r
+ %\centering\r
+  \includegraphics[width=1\linewidth]{img/document_paragraphs.pdf}\r
+\end{figure}\r
+\end{minipage}\r
+\r
+\begin{figure}[h]\r
+ \centering\r
+  \includegraphics[width=0.8\linewidth]{img/queryprocess.pdf}\r
+   \caption{Stepwise query execution process.}\r
+\end{figure}\r
 \r
 \section{Selecting}\r
+Document snippets were used for deciding whether to download the document for the text alignment.\r
+We used a 2-tuple measure, which indicates how many neighbouring word pairs occur both in the snippet and in the suspicious document.\r
+The performance of this measure is shown in Figure~\ref{fig:snippet_graph}.\r
+Given this measure, a threshold for the download decision needs to be set in order to maximize the number of discovered similarities\r
+and minimize the total number of downloads.\r
+A profitable threshold is the one that corresponds to the largest distance between those two curves.\r
 \begin{figure}\r
   \centering\r
   \includegraphics[width=0.8\textwidth]{img/snippets_graph.pdf}\r
@@ -192,6 +208,7 @@ From the suspicious document, there were three diverse types of queries extracte
   \label{fig:snippet_graph}\r
 \end{figure}\r
 \r
+\r
 %\r
 % Yenyova cast\r
 %\r