]> www.fi.muni.cz Git - pan13-paper.git/commitdiff
1. nastrel
authorSimon Suchomel <xsuchom1@anxur.fi.muni.cz>
Wed, 18 Sep 2013 20:43:05 +0000 (22:43 +0200)
committerSimon Suchomel <xsuchom1@anxur.fi.muni.cz>
Wed, 18 Sep 2013 20:43:05 +0000 (22:43 +0200)
pan13-paper/pan13-notebook.pdf [deleted file]
pan13-poster/img/snippets_graph.pdf [new file with mode: 0755]
pan13-poster/img/source_retrieval_process.pdf [new file with mode: 0755]
pan13-poster/poster.tex [changed mode: 0644->0755]

diff --git a/pan13-paper/pan13-notebook.pdf b/pan13-paper/pan13-notebook.pdf
deleted file mode 100644 (file)
index 83b37bc..0000000
Binary files a/pan13-paper/pan13-notebook.pdf and /dev/null differ
diff --git a/pan13-poster/img/snippets_graph.pdf b/pan13-poster/img/snippets_graph.pdf
new file mode 100755 (executable)
index 0000000..96192d0
Binary files /dev/null and b/pan13-poster/img/snippets_graph.pdf differ
diff --git a/pan13-poster/img/source_retrieval_process.pdf b/pan13-poster/img/source_retrieval_process.pdf
new file mode 100755 (executable)
index 0000000..d275b61
Binary files /dev/null and b/pan13-poster/img/source_retrieval_process.pdf differ
old mode 100644 (file)
new mode 100755 (executable)
index ce9f7a3..c3ab1c5
@@ -18,7 +18,7 @@
 \definecolor{ReallyEmph}{rgb}{0.7,0,0}\r
 \r
 \renewcommand{\titlesize}{\Huge}\r
-\title{Distributed System \\ for Discovering Similar Documents}\r
+\title{Diverse Queries and Feature Type Selection \\ for Plagiarism Discovery}\r
 \r
 % Note: only give author names, not institute\r
 \author{Šimon Suchomel, Jan Kasprzak, and Michal Brandejs}\r
@@ -45,7 +45,7 @@
   }\r
   { \end{itemize} }\r
 \r
-\conference{{\bf ICEIS 2008}, 12--16 June 2008, Barcelona, Spain}\r
+\conference{{\bf CLEF 2013}, 23--27 September 2013, Valencia, Spain}\r
 \r
 \setlength{\figbotskip}{\smallskipamount}\r
 \r
 \vspace{-.02\textwidth}\r
 \r
 %%% Begin of Multicols-Enviroment\r
-\begin{multicols}{2}\r
+%\begin{abstract}\r
+%{\sffamily\itshape\r
+%Nějaký abstrakt.\r
+%}\r
+%\end{abstract}\r
 \r
-\rm\r
-%%% Abstract\r
-\begin{abstract}\r
 \r
-{\sffamily\itshape\r
+\begin{multicols}{2}\setlength{\columnseprule}{0pt}\r
 \r
-Nějaký abstrakt.\r
 \r
-}\r
-\end{abstract}\r
+\section{Introduction}\r
+PAN 2013 Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum \r
+\r
+\r
+\r
+\begin{figure}\r
+ \centering\r
+  \includegraphics[width=0.8\textwidth]{img/source_retrieval_process.pdf}\r
+  \caption{Plagiarism discovery process.}\r
+  \label{fig:process}\r
+\end{figure} \r
+\r
+\r
+\end{multicols}\r
 \r
-%%% Introduction\r
-\section{Šimonova část}\r
 \r
-\subsection{Kdovíco}\r
 \r
-\section{Yenyova část}\r
+\begin{multicols}{2}\r
+\r
+%\rm\r
+\r
+%%% Introduction\r
+\section{Querying}\r
+Querying means to effectively utilize the search engine in order to retrieve as many relevant\r
+documents as possible with the minimum number of queries.\r
+%We consider the resulting document relevant if it shares some of the text characteristics with the suspicious document.\r
+In the real world, queries as such represent an appreciable cost; therefore, their minimization should be one of the top priorities. \\\r
+\subsection{Types of Queries}\r
+Three diverse types of queries were extracted from the suspicious document.\r
+\subsubsection{Keywords Based Queries}\r
+\begin{ytemize}\r
+\item TF--IDF based automated keyword extraction;\r
+\item 5-token long; \r
+\item Deterministic;\r
+\item Non-positional;\r
+\item Non-phrasal.\r
+\end{ytemize}\r
+\subsubsection{Intrinsic Plagiarism Based Queries}\r
+\begin{ytemize}\r
+\item Averaged Word Frequency Class based chunking~\cite{AWFC};\r
+\item Random sentence selection from the chunk;\r
+\item Non-deterministic;\r
+\item Positional;\r
+\item Phrasal.\r
+\end{ytemize}\r
+\subsubsection{Paragraph Based Queries}\r
+\begin{ytemize}\r
+\item Longest sentences from miscellaneous paragraphs;\r
+\item Deterministic;\r
+\item Positional;\r
+\item Phrasal.\r
+\end{ytemize}\r
+\r
+\section{Selecting}\r
+\begin{figure}\r
+  \centering\r
+  \includegraphics[width=0.8\textwidth]{img/snippets_graph.pdf}\r
+  \caption{Downloads and similarities performance.}\r
+  \label{fig:snippet_graph}\r
+\end{figure}\r
+\r
+\section{Text Alignment}\r
 \r
 \section{Conclusion}\r
 \r
@@ -145,6 +198,10 @@ Nějaký závěr
 \cemph{Czech National Archive of Graduate Theses}\\\r
 {\tt http://theses.cz/}, contact: {\tt theses@fi.muni.cz}.\r
 \r
+\bibitem{AWFC}\r
+\cemph{Sven Meyer zu Eissen and Benno Stein: Intrinsic Plagiarism Detection}\\\r
+{\tt Proceedings of the European Conference on Information Retrieval (ECIR-06)}, {\tt 2006}\r
+\r
 \end{thebibliography}\r
 \r
 \smallskip\r