]> www.fi.muni.cz Git - pan13-paper.git/blob - pan13-poster/poster.tex
44a14eebe9fae3c4eb52fb05338e017d8bfb9338
[pan13-paper.git] / pan13-poster / poster.tex
1 \documentclass[a0,portrait]{sciposter}\r
2 \r
3 \usepackage{epsfig}\r
4 \usepackage{amsmath}\r
5 \usepackage{amssymb}\r
6 \usepackage{multicol}\r
7 \usepackage[utf8]{inputenc}\r
8 %\usepackage{fancybullets}\r
9 %\usepackage{floatflt}\r
10 %\usepackage{graphics}\r
11 \usepackage{fontspec}\r
12 \usepackage{xunicode}\r
13 \setmainfont[Mapping=tex-text]{DejaVu Sans}\r
14 \setsansfont[Mapping=tex-text]{DejaVu Sans}\r
15 \setmonofont[Mapping=tex-text]{DejaVu Sans Mono}\r
16 \r
17 \definecolor{BoxCol}{rgb}{0.9,0.9,1}\r
18 % uncomment for light blue background to \section boxes \r
19 % for use with default option boxedsections\r
20 \r
21 \definecolor{SectionCol}{rgb}{0,0,0.5}\r
22 % uncomment for dark blue \section text \r
23 \r
24 \definecolor{ReallyEmph}{rgb}{0.7,0,0}\r
25 \r
26 \renewcommand{\titlesize}{\Huge}\r
27 \title{Diverse Queries and Feature Type Selection \\ for Plagiarism Discovery}\r
28 \r
29 % Note: only give author names, not institute\r
30 \author{Šimon Suchomel, Jan Kasprzak, and Michal Brandejs}\r
31  \r
32 % insert correct institute name\r
33 \institute{Faculty of Informatics, Masaryk University, Brno, Czech Republic}\r
34 \r
35 % \email{kas@fi.muni.cz}  % shows author email address below institute\r
36 \r
37 %\date is unused by the current \maketitle\r
38 \r
39 \font\logofont=fi-logo600 at .16\textwidth\r
40 \r
41 \renewcommand{\sectionsize}{\Large}\r
42 \r
43 \newcommand{\cemph}[1]{{\sffamily\bfseries\itshape \textcolor{SectionCol}{#1}}} % strong emphasis: argument set in sans-serif bold italic, coloured SectionCol\r
44 \newcommand{\lemph}[1]{{\rmfamily\itshape \textcolor{SectionCol}{#1}}} % light emphasis: argument set in roman italic (no bold), coloured SectionCol\r
45 \newcommand{\eitem}[1]{\item \cemph{#1}} % list item whose argument is typeset with \cemph\r
46 \r
47 \newenvironment{ytemize} % itemize variant with \itemsep and \parskip set to zero\r
48   { \begin{itemize} % begin code: open a regular itemize, then override its spacing\r
49         \setlength{\itemsep}{0pt} % no extra \itemsep glue between items\r
50         \setlength{\parskip}{0pt} % no extra \parskip glue between paragraphs inside the list\r
51   }\r
52   { \end{itemize} } % end code: close the underlying itemize\r
53 \r
54 \conference{{\bf CLEF 2013}, 23--27 September 2013, Valencia, Spain}\r
55 \r
56 \setlength{\figbotskip}{\smallskipamount}\r
57 \r
58 \renewcommand{\SubSection}[2][?]{ % redefine \SubSection: optional #1 (default "?") is accepted but not used in the body; #2 is the heading text\r
59   \vspace{0.5\secskip} % vertical space above the heading, half of \secskip\r
60   \refstepcounter{subsection} % advance the subsection counter so a following \label can refer to it\r
61   {\bf \subsectionsize \textcolor{SectionCol}{\arabic{section}.\arabic{subsection}~#2}} % heading "<section>.<subsection> <title>" in bold at \subsectionsize, coloured SectionCol\r
62   \par\vspace{0.375\secskip} % end the heading paragraph and add space below it\r
63 }\r
64 \r
65 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\r
66 %%% Begin of Document\r
67 \r
68 \begin{document}\r
69 \r
70 \r
71 %\LEFTSIDEfootlogo  \r
72 % Uncomment to put footer logo on left side, and \r
73 % conference name on right side of footer\r
74 \r
75 % Some examples of caption control (remove % to check result)\r
76 \r
77 %\renewcommand{\algorithmname}{Algoritme} % for Dutch\r
78 \r
79 %\renewcommand{\mastercapstartstyle}[1]{\textit{\textbf{#1}}}\r
80 %\renewcommand{\algcapstartstyle}[1]{\textsc{\textbf{#1}}}\r
81 %\renewcommand{\algcapbodystyle}{\bfseries}\r
82 %\renewcommand{\thealgorithm}{\Roman{algorithm}}\r
83 \r
84 % \maketitle\r
85 \r
86 \vspace*{-.06\textwidth}\r
87 \r
88 \makeatletter\hbox to \hsize{ % hand-built title row; \makeatletter makes @ a letter so that \@title and \@author below are read as single control sequences\r
89 \begin{minipage}[c]{.11\textwidth} % left cell: the znak_MU_modry graphic, shifted with negative skips\r
90         \vspace{-.75\textwidth}\r
91         \hbox{\hskip -.83\textwidth\includegraphics[width=3\textwidth]{znak_MU_modry}\hskip -\textwidth}\r
92         \vspace{-\textwidth}\r
93 \end{minipage}\r
94 \hfil\r
95 \begin{minipage}[c]{.7\textwidth} % middle cell: title, authors, institute and (if set) e-mail\r
96 \begin{center}\r
97       \renewcommand{\baselinestretch}{2.0}\normalsize\r
98       {\titlesize \bf \@title}\par\r
99       \renewcommand{\baselinestretch}{1.0}\normalsize            \r
100       \vspace{0.4\titleskip}\r
101       {\authorsize {\bf\@author} \par}\r
102       {\instsize\r
103        \vspace{0.2\titleskip}\r
104        \theinstitute \par\r
105        \ifthenelse{\equal{\printemail}{}}{%nothing\r
106          }{%\r
107          \vspace{0.2\titleskip}\r
108          \texttt{\printemail}\r
109          }\r
110       }\r
111 \end{center}\r
112 \end{minipage}\r
113 \hfil\r
114 \begin{minipage}[c]{.15\textwidth} % right cell: the glyphs "SL" typeset in \logofont\r
115         \hbox to \hsize{\logofont SL\hss}\r
116 \end{minipage}\r
117 }\makeatother % restore the normal category code of @ after the title row\r
118 \r
119 \vspace{-.02\textwidth}\r
120 \r
121 %%% Begin of Multicols-Environment\r
122 %\begin{abstract}\r
123 %{\sffamily\itshape\r
124 %Nějaký abstrakt.\r
125 %}\r
126 %\end{abstract}\r
127 \r
128 \r
129 \begin{multicols}{2}\setlength{\columnseprule}{0pt}\r
130 \section{Introduction}\r
131 %\r
132 A program that helps to deter real-world plagiarism needs to accomplish many tasks.\r
133 The original documents which served as sources of the plagiarism must be retrieved, and the suspicious passages of the\r
134 input document must be highlighted. This poster presents the methodology used during the PAN 2013 competition on uncovering plagiarism.\r
135 \r
136 The whole process is depicted in Figure~\ref{fig:process}. The source retrieval task is divided into\r
137 2 subtasks: Querying and Selecting, during which the software utilizes a given search engine. The retrieved\r
138 sources must be examined in detail in order to highlight as many plagiarism cases as possible. This process is depicted\r
139 as Text Alignment. Results of this process are called {\em detections}, i.e.~passages of {\em source document} and {\em suspicious document}, which are similar enough to each other, and can serve as a basis for further manual examination for possible plagiarism.\r
140 %\r
141 \vfill\r
142 \columnbreak\r
143 %\r
144 \begin{figure}\r
145  \centering\r
146   \includegraphics[width=0.8\textwidth]{img/source_retrieval_process.pdf}\r
147   \caption{Plagiarism discovery process.}\r
148   \label{fig:process}\r
149 \end{figure} \r
150 \end{multicols}\r
151 \begin{multicols}{2}\r
152 %\rm\r
153 %%% Introduction\r
154 \section{Querying}\r
155 Querying means to effectively utilize a search engine in order to retrieve as many relevant\r
156 documents as possible with the minimum number of queries.\r
157 %We consider the resulting document relevant if it shares some of text characteristics with the suspicious document.\r
158 In the real world, queries represent an appreciable cost, therefore minimizing their number should be one of the top priorities. \r
159 %\subsection{Types of Queries}\r
160 During the initial phase, three diverse types of queries were extracted from each suspicious document.\\\r
161 \begin{minipage}{0.55\linewidth}\r
162 \subsection{Keywords Based Queries}\r
163 \begin{ytemize}\r
164 \item TF--IDF based automated keyword extraction;\r
165 \item 5-token long; \r
166 \item Deterministic;\r
167 \item Non-positional;\r
168 \item Non-phrasal.\r
169 \end{ytemize}\r
170 \end{minipage}\r
171 \begin{minipage}{0.45\linewidth}\r
172 \begin{figure}[h]\r
173  %\centering\r
174   \includegraphics[width=1\linewidth]{img/document_keywords.pdf}\r
175 \end{figure}\r
176 \end{minipage}\r
177 \begin{minipage}{0.55\linewidth}\r
178 \subsection{Intrinsic Plagiarism Based Queries}\r
179 \begin{ytemize}\r
180 \item Averaged Word Frequency Class based chunking~\cite{awfc};\r
181 \item Random sentence selection from the chunk;\r
182 \item Non-deterministic;\r
183 \item Positional;\r
184 \item Phrasal.\r
185 \end{ytemize}\r
186 \end{minipage}\r
187 \begin{minipage}{0.45\linewidth}\r
188 \begin{figure}[h]\r
189  %\centering\r
190   \includegraphics[width=1\linewidth]{img/document_awfc.pdf}\r
191 \end{figure}\r
192 \end{minipage}\r
193 \begin{minipage}{0.55\linewidth}\r
194 \subsection{Paragraph Based Queries}\r
195 \begin{ytemize}\r
196 \item Longest sentences from miscellaneous paragraphs;\r
197 \item Deterministic;\r
198 \item Positional;\r
199 \item Phrasal.\r
200 \end{ytemize}\r
201 \end{minipage}\r
202 \begin{minipage}{0.45\linewidth}\r
203 \begin{figure}[h]\r
204  %\centering\r
205   \includegraphics[width=1\linewidth]{img/document_paragraphs.pdf}\r
206 \end{figure}\r
207 \end{minipage}\r
208 \r
209 \begin{figure}[h]\r
210  \centering\r
211   \includegraphics[width=0.8\linewidth]{img/queryprocess.pdf}\r
212    \caption{Stepwise queries execution process.}\r
213 \end{figure}\r
214 \r
215 \section{Selecting}\r
216 Document snippets were used for deciding whether to download the document for the text alignment.\r
217 We used a 2-tuple measure, which indicates how many neighbouring word pairs occur both in the snippet and in the suspicious document.\r
218 The performance of this measure is depicted in Figure~\ref{fig:snippet_graph}.\r
219 Given this measure, a threshold for the download decision needs to be set in order to maximize the number of discovered similarities\r
220 and minimize the total number of downloads.\r
221 A profitable threshold is one that corresponds to the largest distance between the two curves.\r
222 \begin{figure}\r
223   \centering\r
224   \includegraphics[width=0.8\textwidth]{img/snippets_graph.pdf}\r
225   \caption{Downloads and similarities performance.}\r
226   \label{fig:snippet_graph}\r
227 \end{figure}\r
228 %\r
229 % Yenyova cast\r
230 %\r
231 \section{Text Alignment}\r
232 \r
233 The system uses the same basic principles as in \cite{suchomel_kas_12}:\r
234 \r
235 \begin{ytemize}\r
236 \item{\cemph{common features} between source and suspicious documents}\r
237 \begin{ytemize}\r
238 \item{word 5-grams}\r
239 \item{stop-word 8-grams \cite{stamatatos2011plagiarism}}\r
240 \end{ytemize}\r
241 \item{\cemph{valid intervals} of characters covered by common features\r
242         ``densely enough''}\r
243 \item{\cemph{postprocessing}---remove overlapping detections,\r
244         join neighbouring detections}\r
245 \end{ytemize}\r
246 \r
247 \subsection{Alternative Features}\r
248 \r
249 \begin{ytemize}\r
250 \item{\cemph{contextual n-grams} \cite{torrejondetailed}}\r
251 \begin{ytemize}\r
252 \item{\cemph{The quick} brown \cemph{fox jumped} over the lazy dogs.}\r
253 \item{The \cemph{quick brown} fox \cemph{jumped over} the lazy dogs.}\r
254 \end{ytemize}\r
255 \item{plain word 4-grams}\r
256 \begin{ytemize}\r
257 \item{\cemph{The quick brown fox} jumped over the lazy dogs.}\r
258 \item{The \cemph{quick brown fox jumped} over the lazy dogs.}\r
259 \end{ytemize}\r
260 \end{ytemize}\r
261 \r
262 \begin{table}\r
263 \r
264 \begin{center}\r
265 \begin{tabular}{|l|r|r|r|r|}\r
266 \hline\r
267 \bf feature & \bf recall & \bf precision & \bf granularity & \bf plagdet \\\r
268 \hline\r
269 plain      5-grams & 0.6306 & 0.8484 & 1.0000 & \cemph{0.7235} \\\r
270 contextual 4-grams & 0.6721 & \cemph{0.8282} & 1.0000 & \cemph{0.7421} \\\r
271 plain      4-grams & \cemph{0.7556} & 0.7340 & 1.0000 & \cemph{0.7447} \\\r
272 \hline\r
273 \end{tabular}\r
274 \end{center}\r
275 \r
276 \caption{Comparison of contextual 4-grams and plain word 4-grams}\r
277 \end{table}\r
278 \r
279 \subsection{Global Postprocessing}\r
280 \r
281 \begin{ytemize}\r
282 \item{Similar to PAN 2010 \cite{Kasprzak2010}}\r
283 \item{Overlapping detections removal}\r
284 \item{\cemph{Result:} improvement, but not as significant as in 2010}\r
285 \end{ytemize}\r
286 \r
287 %\r
288 % Spolecna cast\r
289 %\r
290 \r
291 \section{Conclusion}\r
292 \r
293 \subsection{Candidate retrieval}\r
294 \r
295 \begin{ytemize}\r
296 \item{Second best ratio of recall to the number of queries}\r
297 \item{Missing support for phrasal search in ChatNoir is a big stumbling block}\r
298 \end{ytemize}\r
299 \r
300 \subsection{Text alignment}\r
301 \r
302 \begin{ytemize}\r
303 \item{Significant improvement over PAN 2012}\r
304 \item{Word 4-grams are better than contextual 4-grams}\r
305 \item{We need a better ranking system than plagdet!}\r
306 \end{ytemize}\r
307 \r
308 %%% References\r
309 \r
310 %% Note: use of BibTeX also works!!\r
311 \r
312 \bibliographystyle{plain}\r
313 \bibliography{pan13-notebook}\r
314 \nocite{awfc}\r
315 \r
316 %\begin{thebibliography}{1}\r
317 %\r
318 %\bibitem{ISMU}\r
319 %\cemph{Masaryk University Information System}\\\r
320 %{\tt http://is.muni.cz/}, contact: {\tt iscor@fi.muni.cz}.\r
321 %\r
322 %\bibitem{Theses}\r
323 %\cemph{Czech National Archive of Graduate Theses}\\\r
324 %{\tt http://theses.cz/}, contact: {\tt theses@fi.muni.cz}.\r
325 %\r
326 %\bibitem{AWFC}\r
327 %\cemph{Sven Meyer Zu Eissen and Benno Stein: Intrinsic Plagiarism Detection}\\\r
328 %{\tt Proceedings of the European Conference on Information Retrieval (ECIR-06)}, {\tt 2006}\r
329 %\r
330 %\end{thebibliography}\r
331 \r
332 \smallskip\r
333 \hrule height .1em\r
334 \medskip\r
335 \r
336 % \sffamily\r
337 \r
338 \r
339 \hbox to \hsize{ % footer row as wide as the current \hsize: contact details plus the qrcode.png image\r
340         {\hsize=0.5\hsize\vbox{ % group with \hsize halved; the \vbox inside holds the contact text\r
341 \cemph{Contact information:}\\\r
342         Šimon Suchomel {\tt suchomel@fi.muni.cz}\\\r
343         Jan Kasprzak {\tt kas@fi.muni.cz}\\\r
344         {\cemph{\tt http://www.fi.muni.cz/\~{}kas/pan13/}}\r
345 } % closes only the \vbox; the group opened by {\hsize=0.5\hsize is still open here\r
346         \hfill % stretchable space between the contact text and the image box\r
347         {\hsize=0.4\hsize\vbox{ % NOTE(review): nested inside the still-open group above, so this is 0.4 of the already halved \hsize -- confirm this is intended\r
348         \includegraphics[width=\hsize]{qrcode.png}\r
349 }}}} % closes, in order: this \vbox, its \hsize group, the outer 0.5\hsize group, and the \hbox\r
350         \r
351 \r
352 \end{multicols}\r
353 \r
354 \end{document}\r