]> www.fi.muni.cz Git - pan13-paper.git/blob - pan13-poster/poster.tex
2e3dce626e58ed739ee0e64e6fc9e202b659b37e
[pan13-paper.git] / pan13-poster / poster.tex
1 \documentclass[a0,portrait]{sciposter}\r
2 \r
3 \usepackage{epsfig}\r
4 \usepackage{amsmath}\r
5 \usepackage{amssymb}\r
6 \usepackage{multicol}\r
7 \usepackage{bera}\r
8 \usepackage[utf8]{inputenc}\r
9 %\usepackage{fancybullets}\r
10 %\usepackage{floatflt}\r
11 %\usepackage{graphics}\r
12 \r
13 \definecolor{BoxCol}{rgb}{0.9,0.9,1}\r
14 % uncomment for light blue background to \section boxes \r
15 % for use with default option boxedsections\r
16 \r
17 \definecolor{SectionCol}{rgb}{0,0,0.5}\r
18 % uncomment for dark blue \section text \r
19 \r
20 \definecolor{ReallyEmph}{rgb}{0.7,0,0}\r
21 \r
22 \renewcommand{\titlesize}{\Huge}\r
23 \title{Diverse Queries and Feature Type Selection \\ for Plagiarism Discovery}\r
24 \r
25 % Note: only give author names, not institute\r
26 \author{Šimon Suchomel, Jan Kasprzak, and Michal Brandejs}\r
27  \r
28 % insert correct institute name\r
29 \institute{Faculty of Informatics, Masaryk University, Brno, Czech Republic}\r
30 \r
31 % \email{kas@fi.muni.cz}  % shows author email address below institute\r
32 \r
33 %\date is unused by the current \maketitle\r
34 \r
35 \font\logofont=fi-logo600 at .16\textwidth\r
36 \r
37 \renewcommand{\sectionsize}{\Large}\r
38 \r
39 \newcommand{\cemph}[1]{\textcolor{SectionCol}{\sffamily\bfseries\itshape #1}} % bold italic sans-serif text in SectionCol\r
40 \newcommand{\lemph}[1]{\textcolor{SectionCol}{\rmfamily\itshape #1}} % italic roman text in SectionCol\r
41 \newcommand{\eitem}[1]{\item\cemph{#1}} % list item whose text is set with \cemph\r
42 \r
43 \newenvironment{ytemize}{% itemize variant with \itemsep and \parskip set to 0pt\r
44   \begin{itemize}%\r
45     \setlength{\itemsep}{0pt}%\r
46     \setlength{\parskip}{0pt}%\r
47 }{%\r
48   \end{itemize}}\r
49 \r
50 \conference{{\bf CLEF 2013}, 23--27 September 2013, Valencia, Spain}\r
51 \r
52 \setlength{\figbotskip}{\smallskipamount}\r
53 \r
54 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\r
55 %%% Begin of Document\r
56 \r
57 \begin{document}\r
58 \r
59 \r
60 %\LEFTSIDEfootlogo  \r
61 % Uncomment to put footer logo on left side, and \r
62 % conference name on right side of footer\r
63 \r
64 % Some examples of caption control (remove % to check result)\r
65 \r
66 %\renewcommand{\algorithmname}{Algoritme} % for Dutch\r
67 \r
68 %\renewcommand{\mastercapstartstyle}[1]{\textit{\textbf{#1}}}\r
69 %\renewcommand{\algcapstartstyle}[1]{\textsc{\textbf{#1}}}\r
70 %\renewcommand{\algcapbodystyle}{\bfseries}\r
71 %\renewcommand{\thealgorithm}{\Roman{algorithm}}\r
72 \r
73 % \maketitle\r
74 \r
75 \vspace*{-.06\textwidth}\r
76 \makeatletter % \@title and \@author below are internal macros: @ must be a letter while this row is read\r
77 \hbox to \hsize{\r
78 \begin{minipage}[c]{.11\textwidth} % left cell: znak_MU_modry graphic\r
79         \vspace{-.75\textwidth}\r
80         \hbox{\hskip -.83\textwidth\includegraphics[width=3\textwidth]{znak_MU_modry}\hskip -\textwidth}\r
81         \vspace{-\textwidth}\r
82 \end{minipage}\r
83 \hfil\r
84 \begin{minipage}[c]{.7\textwidth} % middle cell: title, authors, institute and optional e-mail\r
85 \begin{center}\r
86       \renewcommand{\baselinestretch}{2.0}\normalsize\r
87       {\titlesize \bf \@title}\par\r
88       \renewcommand{\baselinestretch}{1.0}\normalsize            \r
89       \vspace{0.4\titleskip}\r
90       {\authorsize {\bf\@author} \par}\r
91       {\instsize\r
92        \vspace{0.2\titleskip}\r
93        \theinstitute \par\r
94        \ifthenelse{\equal{\printemail}{}}{%nothing\r
95          }{%\r
96          \vspace{0.2\titleskip}\r
97          \texttt{\printemail}\r
98          }\r
99       }\r
100 \end{center}\r
101 \end{minipage}\r
102 \hfil\r
103 \begin{minipage}[c]{.15\textwidth} % right cell: the glyphs "SL" set in \logofont\r
104         \hbox to \hsize{\logofont SL\hss}\r
105 \end{minipage}\r
106 }\r
107 \makeatother % restore the normal category code of @ for the rest of the document\r
108 \vspace{-.02\textwidth}\r
109 \r
110 %%% Begin of Multicols-Environment\r
111 %\begin{abstract}\r
112 %{\sffamily\itshape\r
113 %Nějaký abstrakt.\r
114 %}\r
115 %\end{abstract}\r
116 \r
117 \r
118 \begin{multicols}{2}\setlength{\columnseprule}{0pt}\r
119 \section{Introduction}\r
120 %\r
121 A program for helping to deter real-world plagiarism needs to accomplish many tasks.\r
122 The original documents which served as sources of the plagiarism must be retrieved, and the suspicious passages of the\r
123 input document must be highlighted. This poster presents the methodology used during the PAN 2013 competition on uncovering plagiarism.\r
124 \r
125 The whole process is depicted in Figure~\ref{fig:process}. The source retrieval task is divided into\r
126 two subtasks, Querying and Selecting, during which the software utilizes a given search engine. The retrieved\r
127 sources must be examined in detail in order to highlight as many plagiarism cases as possible. This process is depicted\r
128 as Text Alignment.\r
129 \r
130 %\r
131 \vfill\r
132 \columnbreak\r
133 %\r
134 \begin{figure}\r
135  \centering\r
136   \includegraphics[width=0.7\textwidth]{img/source_retrieval_process.pdf}\r
137   \caption{Plagiarism discovery process.}\r
138   \label{fig:process}\r
139 \end{figure} \r
140 \end{multicols}\r
141 \begin{multicols}{2}\r
142 %\rm\r
143 %%% Introduction\r
144 \section{Querying}\r
145 Querying means to effectively utilize the search engine in order to retrieve as many relevant\r
146 documents as possible with the minimum number of queries.\r
147 %We consider the resulting document relevant if it shares some of text characteristics with the suspicious document.\r
148 In the real world, such queries represent an appreciable cost; therefore, their minimization should be one of the top priorities. \r
149 %\subsection{Types of Queries}\r
150 Three diverse types of queries were extracted from the suspicious document.\\\r
151 \begin{minipage}{0.55\linewidth}\r
152 \subsection{Keywords Based Queries}\r
153 \begin{ytemize}\r
154 \item TF--IDF based automated keyword extraction;\r
155 \item 5-token long; \r
156 \item Deterministic;\r
157 \item Non-positional;\r
158 \item Non-phrasal.\r
159 \end{ytemize}\r
160 \end{minipage}\r
161 \begin{minipage}{0.45\linewidth}\r
162 \begin{figure}[h]\r
163  %\centering\r
164   \includegraphics[width=1\linewidth]{img/document_keywords.pdf}\r
165 \end{figure}\r
166 \end{minipage}\r
167 \begin{minipage}{0.55\linewidth}\r
168 \subsection{Intrinsic Plagiarism Based Queries}\r
169 \begin{ytemize}\r
170 \item Averaged Word Frequency Class based chunking~\cite{AWFC};\r
171 \item Random sentence selection from the chunk;\r
172 \item Non-deterministic;\r
173 \item Positional;\r
174 \item Phrasal.\r
175 \end{ytemize}\r
176 \end{minipage}\r
177 \begin{minipage}{0.45\linewidth}\r
178 \begin{figure}[h]\r
179  %\centering\r
180   \includegraphics[width=1\linewidth]{img/document_awfc.pdf}\r
181 \end{figure}\r
182 \end{minipage}\r
183 \begin{minipage}{0.55\linewidth}\r
184 \subsection{Paragraph Based Queries}\r
185 \begin{ytemize}\r
186 \item Longest sentences from miscellaneous paragraphs;\r
187 \item Deterministic;\r
188 \item Positional;\r
189 \item Phrasal.\r
190 \end{ytemize}\r
191 \end{minipage}\r
192 \begin{minipage}{0.45\linewidth}\r
193 \begin{figure}[h]\r
194  %\centering\r
195   \includegraphics[width=1\linewidth]{img/document_paragraphs.pdf}\r
196 \end{figure}\r
197 \end{minipage}\r
198 \r
199 \begin{figure}[h]\r
200  \centering\r
201   \includegraphics[width=0.8\linewidth]{img/queryprocess.pdf}\r
202    \caption{Stepwise queries execution process.}\r
203 \end{figure}\r
204 \r
205 \section{Selecting}\r
206 Document snippets were used for deciding whether to download the document for the text alignment.\r
207 We used a 2-tuple measure, which indicates how many neighbouring word pairs occur both in the snippet and in the suspicious document.\r
208 The performance of this measure is depicted in Figure~\ref{fig:snippet_graph}.\r
209 Given this measure, a threshold for the download decision needs to be set in order to maximize the discovered similarities\r
210 and minimize the total number of downloads.\r
211 A profitable threshold is one that corresponds to the largest distance between the two curves.\r
212 \begin{figure}\r
213   \centering\r
214   \includegraphics[width=0.8\textwidth]{img/snippets_graph.pdf}\r
215   \caption{Downloads and similarities performance.}\r
216   \label{fig:snippet_graph}\r
217 \end{figure}\r
218 \r
219 \r
220 %\r
221 % Yenyova cast\r
222 %\r
223 \r
224 \section{Text Alignment}\r
225 \r
226 %\r
227 % Spolecna cast\r
228 %\r
229 \r
230 \section{Conclusion}\r
231 \r
232 Nějaký závěr\r
233 \r
234 %%% References\r
235 \r
236 %% Note: use of BibTeX also works!!\r
237 \r
238 \bibliographystyle{plain}\r
239 \begin{thebibliography}{1}\r
240 \r
241 \bibitem{ISMU}\r
242 \cemph{Masaryk University Information System}\\\r
243 {\tt http://is.muni.cz/}, contact: {\tt iscor@fi.muni.cz}.\r
244 \r
245 \bibitem{Theses}\r
246 \cemph{Czech National Archive of Graduate Theses}\\\r
247 {\tt http://theses.cz/}, contact: {\tt theses@fi.muni.cz}.\r
248 \r
249 \bibitem{AWFC}\r
250 \cemph{Sven Meyer Zu Eissen and Benno Stein: Intrinsic Plagiarism Detection}\\\r
251 {\tt Proceedings of the European Conference on Information Retrieval (ECIR-06)}, {\tt 2006}\r
252 \r
253 \end{thebibliography}\r
254 \r
255 \smallskip\r
256 \hrule height .1em\r
257 \medskip\r
258 \r
259 % \sffamily\r
260 \r
261 QR kód?\r
262 \r
263 \cemph{Contact information:}\\\r
264         Šimon Suchomel, {\tt suchomel@fi.muni.cz},\\\r
265         Jan Kasprzak, {\tt kas@fi.muni.cz}.\r
266 \r
267 \r
268 \end{multicols}\r
269 \r
270 \end{document}\r
271 \r