# -*- coding: utf-8 -*-
# ===========================================================================
# BeautifulSoup_parsing_UA.ipynb
# Automatically generated by Colaboratory.
# Original file is located at
#     https://colab.research.google.com/drive/17XCLu5baQY1Yeq3BsniYHoPEjY-w-ZJD
# ===========================================================================
#
# Dependencies. The notebooks installed these with ``!pip install ...`` cells;
# shell magics are not valid Python in a module, so install them beforehand:
#     pip install beautifulsoup4 feedparser pandas readability-lxml
#     pip install stanza scikit-learn gensim nltk

import re
import string

import feedparser
import lxml.html as html
import nltk
import pandas as pd
import requests
import stanza
from bs4 import BeautifulSoup
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from lxml.html.clean import Cleaner
from nltk.corpus import stopwords
from readability.readability import Document
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# Seconds to wait for a server before giving up on a single article.
REQUEST_TIMEOUT = 30

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _fetch_soup(link):
    """Download *link* and return the page parsed with ``html.parser``.

    A failed request (timeout, connection error, malformed URL) is reported
    and yields an empty document instead of raising, so the caller still
    produces one row per link and the scraped columns stay aligned with the
    link table.
    """
    try:
        markup = requests.get(link, timeout=REQUEST_TIMEOUT).text
    except requests.RequestException as err:
        print(f"Failed to download {link}: {err}")
        markup = ''
    return BeautifulSoup(markup, 'html.parser')


def _parse_up_article(soup):
    """Return ``(title, date, text, tags)`` extracted from a parsed page.

    Every element that is not found is replaced by its placeholder string.
    """
    title_element = soup.find('h1', class_='post_title')
    title = title_element.get_text() if title_element is not None else "Нет названия"

    date_element = soup.find('div', class_='post_time')
    date = date_element.get_text() if date_element is not None else "Нет даты"

    article_element = soup.find('div', class_='block_post')
    if article_element is not None:
        # One paragraph per line; an article container without <p> gives "".
        text = "".join(
            paragraph.get_text() + "\n"
            for paragraph in article_element.find_all('p')
        )
    else:
        text = "Текст статьи не найден"

    tags_element = soup.find('div', class_='post_tags')
    if tags_element is not None:
        tags = ", ".join(tag.get_text() for tag in tags_element.find_all('a'))
    else:
        tags = "Теги не найдены"

    return title, date, text, tags


# CSS classes of the two page layouts tried by the second scraper, in order.
_RT_LAYOUTS = (
    {
        'title': 'article__heading article__heading_article-page',
        'date': 'article__date article__date_article-page',
        'text': 'article__text article__text_article-page js-mediator-article',
        'summary': 'article__summary article__summary_article-page js-mediator-article',
    },
    {
        'title': 'title',
        'date': 'date',
        'text': 'ArticleView-text',
        'summary': 'ArticleView-summary',
    },
)


def _parse_rt_article(soup):
    """Return ``(title, date, text)`` using the first layout that matches.

    A layout matches when both its title and its date element are present.
    The body falls back from the full text to the summary, then to a
    placeholder. When no layout matches, all three fields are placeholders.
    """
    for layout in _RT_LAYOUTS:
        title_element = soup.find('h1', class_=layout['title'])
        date_element = soup.find('div', class_=layout['date'])
        if title_element is None or date_element is None:
            continue

        body_element = soup.find('div', class_=layout['text'])
        if body_element is None:
            body_element = soup.find('div', class_=layout['summary'])
        if body_element is not None:
            text = body_element.get_text(separator=" ")
        else:
            text = "Нет текста статьи"
        return title_element.get_text(), date_element.get_text(), text

    return "Нет названия", "Нет даты", "Нет текста статьи"


def _strip_punctuation(text):
    """Return *text* without ASCII punctuation; missing (NaN) cells give ""."""
    if text is None or isinstance(text, float):
        return ""
    return str(text).translate(_PUNCTUATION_TABLE)


def _lemmatize_corpus(texts, pipeline, stop_words):
    """Lemmatize every text and drop stop words.

    Returns one space-joined string of lower-cased lemmas per input text.
    Blank texts are passed through as "" without invoking the pipeline.
    """
    stop_word_set = set(stop_words)  # O(1) membership instead of a list scan
    lemmatized = []
    for text in texts:
        if not text.strip():
            lemmatized.append('')
            continue
        doc = pipeline(text)
        lemmas = []
        for sentence in doc.sentences:
            for word in sentence.words:
                # Fall back to the surface form if no lemma was produced.
                lemma = (word.lemma or word.text).lower()
                if lemma not in stop_word_set:
                    lemmas.append(lemma)
        lemmatized.append(' '.join(lemmas))
    return lemmatized


# --- Scrape the articles listed in up_links.csv -----------------------------
rt_feed = pd.read_csv('/content/up_links.csv')

rows = [_parse_up_article(_fetch_soup(link)) for link in rt_feed.link]
rt_data = pd.DataFrame(rows, columns=['Title', 'Date', 'Text', 'Tags'])

rt_feed = pd.concat([rt_feed, rt_data], axis=1)
print(rt_feed.head())
rt_feed.to_csv('up_fulltext.csv')

# ===========================================================================
# Beautiful_soup_parsing_RU.ipynb
# Automatically generated by Colaboratory.
# Original file is located at
#     https://colab.research.google.com/drive/1aW_C4OTzMEhYXYltdpk9LkZl7_PYcNFU
# ===========================================================================

# --- Scrape the articles listed in rt_links_final.csv -----------------------
rt_feed = pd.read_csv('/content/rt_links_final.csv')

rows = [_parse_rt_article(_fetch_soup(link)) for link in rt_feed.link]
rt_data = pd.DataFrame(rows, columns=['Title', 'Date', 'Text'])

rt_feed = pd.concat([rt_feed, rt_data], axis=1)
rt_feed.to_csv('radiosvoboda_fulltext_2.csv')
print(rt_feed)

# ===========================================================================
# LDA_stanza_UA.ipynb
# Automatically generated by Colaboratory.
# Original file is located at
#     https://colab.research.google.com/drive/1N1LrapF5Q5-qvGsfc1vxCLo9yb9g7sSH
# ===========================================================================

stanza.download('uk')
nltk.download('stopwords')
nltk.download('punkt')

# One stop word per line, no header.
stopwords_ua = pd.read_csv("stopwords_ua.txt", header=None, names=['stopwords'])
stop_words_ua = list(stopwords_ua.stopwords)

data = pd.read_csv("/content/up_fulltext - new.csv")

nlp = stanza.Pipeline('uk', processors='tokenize,lemma')

corpus = data["Text"]

cleaned_corpus = [_strip_punctuation(text) for text in corpus]
lemmatized_corpus = _lemmatize_corpus(cleaned_corpus, nlp, stop_words_ua)

# Detect frequent bigrams, then trigrams on top of the bigram-merged corpus.
tokenized_corpus = [text.split() for text in lemmatized_corpus]
bigram = Phrases(tokenized_corpus, min_count=2, threshold=1)
trigram = Phrases(bigram[tokenized_corpus], min_count=2, threshold=1)
bigram_corpus = [bigram[tokens] for tokens in tokenized_corpus]
trigram_corpus = [trigram[bigram_tokens] for bigram_tokens in bigram_corpus]
import string

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import stanza
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud


def _print_top_terms(lda_model, vectorizer, top_n=10):
    """Print the *top_n* heaviest terms of every topic.

    Returns, for each topic, the full list of ``(term, weight)`` pairs sorted
    by descending weight.
    """
    # Fetch the vocabulary once rather than once per term.
    feature_names = vectorizer.get_feature_names_out()
    topic_terms = [
        [(feature_names[i], topic[i]) for i in topic.argsort()[::-1]]
        for topic in lda_model.components_
    ]
    # NOTE: the printed heading is 1-based, while the topic ids assigned to
    # documents are the 0-based argmax index.
    for i, topic in enumerate(topic_terms):
        print(f"Тема {i + 1}:")
        for term, weight in topic[:top_n]:
            print(f"{term}: {weight:.2f}")
        print()
    return topic_terms


def _assign_topics(topic_distribution, threshold):
    """Return the dominant topic per document, or -1 if none exceeds *threshold*."""
    return [
        topic.argmax() if topic.max() > threshold else -1
        for topic in topic_distribution
    ]


def _show_wordcloud(cloud):
    """Display a generated word cloud without axes."""
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


def _translate_lemmas(text, mapping):
    """Translate whitespace-separated lemmas in *text* using *mapping*.

    The longest phrase starting at each position is tried first, so that
    multi-word keys of *mapping* are applied; a token-by-token lookup could
    never match them. Tokens without a translation are kept unchanged.
    """
    tokens = text.split()
    longest = max((len(key.split()) for key in mapping), default=1)
    translated = []
    position = 0
    while position < len(tokens):
        for size in range(min(longest, len(tokens) - position), 0, -1):
            phrase = ' '.join(tokens[position:position + size])
            if phrase in mapping:
                translated.append(mapping[phrase])
                break
        else:
            size = 1
            translated.append(tokens[position])
        position += size
    return ' '.join(translated)


def _plot_topic_timeline(frame):
    """Plot the number of articles per day for every topic except -1 and 0.

    *frame* needs a datetime column 'Дата' and a 'Topic' column. Returns the
    filtered, date-sorted frame and the date-by-topic count table.
    """
    filtered = frame[~frame['Topic'].isin([-1, 0])].sort_values(by='Дата')
    grouped = filtered.groupby(['Дата', 'Topic']).size().unstack(fill_value=0)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%d.%m.%Y'))
    grouped.plot(kind='line', ax=ax)

    plt.xlabel('Date')
    plt.ylabel('Number of articles by topic')
    plt.title('Change of topics along the time axis (excluding -1, 0, topics)')
    plt.grid(True)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    # A single legend call: a second plt.legend() would replace the first and
    # drop its title and outside-the-axes placement.
    plt.legend(title='Topic', bbox_to_anchor=(1.05, 1), loc='upper left',
               fontsize=10, title_fontsize=12)
    plt.show()
    return filtered, grouped


def _remove_punctuation(text, _table=str.maketrans('', '', string.punctuation)):
    """Return *text* without ASCII punctuation; missing (NaN) cells give ""."""
    if text is None or isinstance(text, float):
        return ""
    return str(text).translate(_table)


def _lemmatize_texts(texts, pipeline, stop_words):
    """Lemmatize every text and drop stop words.

    Returns one space-joined string of lower-cased lemmas per input text.
    Blank texts are passed through as "" without invoking the pipeline.
    """
    stop_word_set = set(stop_words)  # O(1) membership instead of a list scan
    lemmatized = []
    for text in texts:
        if not text.strip():
            lemmatized.append('')
            continue
        doc = pipeline(text)
        lemmas = []
        for sentence in doc.sentences:
            for word in sentence.words:
                # Fall back to the surface form if no lemma was produced.
                lemma = (word.lemma or word.text).lower()
                if lemma not in stop_word_set:
                    lemmas.append(lemma)
        lemmatized.append(' '.join(lemmas))
    return lemmatized


# --- TF-IDF + LDA on the n-gram corpus --------------------------------------
corpus_with_ngrams = [" ".join(text) for text in trigram_corpus]

tfidf_vectorizer = TfidfVectorizer(max_df=0.85, min_df=8)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus_with_ngrams)

num_topics = 13
lda = LatentDirichletAllocation(n_components=num_topics, random_state=50)
lda.fit(tfidf_matrix)

topic_terms = _print_top_terms(lda, tfidf_vectorizer)

topic_distribution = lda.transform(tfidf_matrix)
threshold = 0.5
assigned_topics = _assign_topics(topic_distribution, threshold)

topics = assigned_topics
data['Topic'] = topics
print(data.head(100))
data.to_csv('up_fulltext_topics.csv')

# --- Word clouds ------------------------------------------------------------
lemmatized_text = ' '.join(lemmatized_corpus)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(lemmatized_text)
_show_wordcloud(wordcloud)

# The terms of the cloud above; these are the keys of the dictionary below.
wordcloud_words = wordcloud.words_
word_list = list(wordcloud_words.keys())
print(word_list)

translation_dict = {
    'україна': 'ukraine',
    'росія': 'russia',
    'джерело': 'source',
    'російський': 'russian',
    'деталі': 'details',
    'місто': 'city',
    'київ': 'kyiv',
    'український': 'ukrainian',
    'пряма': 'direct',
    'мова': 'speech',
    'рф': 'rf',
    'заявити': 'declare',
    'країна': 'country',
    'володимир зеленський': 'volodymyr zelenskyy',
    'переговори': 'negotiations',
    'лютий': 'february',
    'повідомити': 'report',
    'війна': 'war',
    'росіянин': 'russian',
    'ворог': 'enemy',
    'санкція': 'sanction',
    'військовий': 'military',
    'інформація': 'information',
    'харків': 'kharkiv',
    'дослівно': 'literally',
    'держава': 'state',
    'українець': 'ukrainian',
    'рішення': 'decision',
    'президент україна': 'president of ukraine',
    'єс': 'eu',
    'президент володимир': 'president volodymyr',
    'європейська правда': 'european truth',
    'війна україна': 'war ukraine',
    'повідомляти': 'report',
    'пряма мова': 'direct speech',
    'район': 'district',
    'готовий': 'ready',
    'територія': 'territory',
    'громадянин': 'citizen',
    'немати': 'not',
    'допомога': 'help',
    'слово': 'word',
    'підтримка': 'support',
    'частина': 'part',
    'наразі': 'now',
    'обстріл': 'shelling',
    'напрямок': 'direction',
    'зазначити': 'note',
    'зокрема': 'in particular',
    'працювати': 'work',
    'нагадати': 'remind',
    'бій': 'battle',
    'техніка': 'equipment',
    'ситуація': 'situation',
    'військо': 'army',
    'збройний сила': 'armed force',
    'уряд': 'government',
    'нато': 'nato',
    'березень': 'march',
    'сторона': 'side',
    'російський військо': 'russian army',
    'сила': 'force',
    'закордонний справа': 'foreign affairs',
    'кордон': 'border',
    'путін': 'putin',
    'додати': 'add',
    'літак': 'plane',
    'вважати': 'consider',
    'оборона': 'defense',
    'місце': 'place',
    'безпека': 'security',
    'білорусь': 'belarus',
    'новий': 'new',
    'кремль': 'kremlin',
    'говорити': 'speak',
    'зсу': 'afu',
    'година': 'hour',
    'офіс президент': "president's office",
    'зброя': 'weapon',
    'позиція': 'position',
    'європа': 'europe',
    'стан': 'state',
    'удар': 'strike',
    'ракета': 'missile',
    'дані': 'data',
    'влада': 'power',
    'столиця': 'capital',
    'росія україна': 'russia ukraine',
    'зеленський': 'zelensky',
    'дія': 'action',
    'бачити': 'see',
    'україна джерело': 'ukraine source',
    'світ': 'world',
    'можливість': 'opportunity',
    'закликати': 'call',
    'міжнародний': 'international',
    'тисяча': 'thousand',
    'представник': 'representative',
    'сила україна': 'force ukraine',
    'міністр закордонний': 'foreign minister',
    'народ': 'people',
    'мета': 'purpose',
    'заява': 'statement',
    'йтися': 'go',
    'окупант': 'occupier',
    'україна володимир': 'ukraine volodymyr',
    'життя': 'life',
    'захід': 'event',
    'вибух': 'explosion',
    'наступ': 'offensive',
    'зробити': 'do',
    'день': 'day',
    'особа': 'person',
    'намагатися': 'try',
    'український військовий': 'ukrainian military',
    'передісторія': 'prehistory',
    'відбутися': 'happen',
    'swift': 'swift',
    'разом': 'together',
    'бік': 'side',
    'втрата': 'loss',
    'число': 'number',
    'ніч': 'night',
    'великий': 'large',
    'йти': 'go',
    'відбуватися': 'happening',
    'президент росія': 'president russia',
    'територія україна': 'territory of ukraine',
    'центр': 'center',
    'напад': 'attack',
    'право': 'right',
    'отримати': 'receive',
    'ракет': 'missiles',
    'війна росія': 'war russia',
    'президент рф': 'president of russia',
    'міністр оборона': 'defense minister',
    'мир': 'peace',
    'знати': 'know',
    'зупинити': 'stop',
    'поранений': 'wounded',
    'михайло подоляк': 'mykhailo podolyak',
    'стати': 'become',
    'підрозділ': 'unit',
    'загроза': 'threat',
    'водночас': 'at the same time',
    'свобода': 'freedom',
    'будинок': 'house',
    'завдання': 'task',
    'наголосив': 'emphasized',
    'готовність': 'readiness',
    'російський федерація': 'russian federation',
    'володимир путін': 'volodymyr putin',
    'деталі слово': 'details word',
    'писати європейська': 'write european',
    'укриття': 'shelter',
    'область': 'region',
    'агресія': 'aggression',
    'дитина': 'child',
    'доба': 'day',
    'відео': 'video',
    'питання': 'question',
    'написати': 'write',
    'захищати': 'defend',
    'тривати': 'last',
    'москва': 'moscow',
    'перебувати': 'stay',
    'зустріч': 'meeting',
    'передувало': 'preceded',
    'європейський союз': 'european union',
    'захист': 'protection',
    'армія': 'army',
    'сайт': 'website',
    'загинути': 'die',
    'спроба': 'attempt',
    'злочин': 'crime',
    'радник голова': 'counselor head',
    'спеціальний': 'special',
    'склад': 'composition',
    'дснс': 'ssns',
    'лідер': 'leader',
    'просити': 'ask',
    'кількість': 'number',
    'земля': 'land',
    'дати': 'date',
    'робити': 'do',
    'видання': 'edition',
    'російський окупант': 'russian occupier',
    'президент михайло': 'president mykhailo',
    'український делегація': 'ukrainian delegation',
    'президент': 'president',
    'прикордонник': 'border guard',
    'група': 'group',
    'цивільний': 'civilian',
    'європейський': 'european',
    'мзс': 'foreign ministry',
    'військовослужбовець': 'serviceman',
    'результат': 'result',
    'система': 'system',
    'володимир': 'volodymyr',
    'агресора': 'aggressor',
    'євросоюз': 'eu',
    'вулиця': 'street',
}

lemmatized_text = ' '.join(lemmatized_corpus)
ukrainian_text = lemmatized_text
english_text = _translate_lemmas(ukrainian_text, translation_dict)

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(english_text)
_show_wordcloud(wordcloud)

# --- Topic timeline ---------------------------------------------------------
data = pd.read_csv("/content/up_fulltext_topics.csv")
data['Дата'] = pd.to_datetime(data['Date'], format='%d.%m.%Y')

filtered_data, data_grouped = _plot_topic_timeline(data)

# ===========================================================================
# LDA_RU.ipynb
# Automatically generated by Colaboratory.
# Original file is located at
#     https://colab.research.google.com/drive/18OmQvMK5NuuRSkZNKK5NMB2_gqti-wyH
# ===========================================================================
#
# Dependencies. The notebook installed these with ``!pip install ...`` cells;
# shell magics are not valid Python in a module, so install them beforehand:
#     pip install stanza scikit-learn gensim nltk matplotlib seaborn

nltk.download('stopwords')
nltk.download('punkt')

# One stop word per line, no header.
stopwords_ru = pd.read_csv("/content/stop_words_russian.txt", header=None, names=['stopwords'])
stop_words_ru = list(stopwords_ru.stopwords)

# quoting=3 is csv.QUOTE_NONE. on_bad_lines='skip' replaces
# error_bad_lines=False, which was removed in pandas 2.0.
data = pd.read_csv("/content/rt_final_1.csv", delimiter=';', on_bad_lines='skip',
                   quoting=3, header=None,
                   names=['A', 'link', 'Title', 'Date', 'Text'])

nlp = stanza.Pipeline('ru', processors='tokenize,lemma')

corpus = data["Text"]
print(corpus[1])

cleaned_corpus = [_remove_punctuation(text) for text in data["Text"]]
lemmatized_corpus = _lemmatize_texts(cleaned_corpus, nlp, stop_words_ru)

# Detect frequent bigrams, then trigrams on top of the bigram-merged corpus.
tokenized_corpus = [text.split() for text in lemmatized_corpus]
bigram = Phrases(tokenized_corpus, min_count=2, threshold=1)
trigram = Phrases(bigram[tokenized_corpus], min_count=2, threshold=1)
bigram_corpus = [bigram[tokens] for tokens in tokenized_corpus]
trigram_corpus = [trigram[bigram_tokens] for bigram_tokens in bigram_corpus]

corpus_with_ngrams = [" ".join(text) for text in trigram_corpus]

# --- TF-IDF + LDA on the n-gram corpus --------------------------------------
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=6)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus_with_ngrams)

num_topics = 13
lda = LatentDirichletAllocation(n_components=num_topics, random_state=50)
lda.fit(tfidf_matrix)

topic_terms = _print_top_terms(lda, tfidf_vectorizer)

topic_distribution = lda.transform(tfidf_matrix)
threshold = 0.5
assigned_topics = _assign_topics(topic_distribution, threshold)

topics_df = pd.DataFrame({'Topic': assigned_topics})
data_with_topics = pd.concat([data, topics_df], axis=1)
data_with_topics.to_csv('rt_fulltext_topics_1.csv')

# --- Word clouds ------------------------------------------------------------
lemmatized_text = ' '.join(lemmatized_corpus)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(lemmatized_text)
_show_wordcloud(wordcloud)

# The terms of the cloud above; these are the keys of the dictionary below.
wordcloud_words = wordcloud.words_
word_list = list(wordcloud_words.keys())
print(word_list)

translation_dict = {
    'украина': 'Ukraine',
    'россия': 'Russia',
    'заявить': 'declare',
    'ранее': 'earlier',
    'свой': 'own',
    'страна': 'country',
    'российский': 'Russian',
    'слово': 'word',
    'киев': 'Kiev',
    'сообщить': 'report',
    'украинский': 'Ukrainian',
    'подчеркнуть': 'emphasize',
    'отметить': 'note',
    'февраль': 'February',
    'владимир путин': 'Vladimir Putin',
    'зеленский': 'zelensky',
    'вопрос': 'issue',
    'донбасс': 'donbass',
    'ес': 'eu',
    'решение': 'decision',
    'военный операция': 'military operation',
    'должен': 'should',
    'сша': 'usa',
    'говорить': 'speak',
    'переговоры': 'negotiations',
    'риа новость': 'ria novosti',
    'москва': 'moscow',
    'президент россия': 'president russia',
    'сторона': 'side',
    'президент': 'president',
    'ситуация украина': 'situation ukraine',
    'санкция': 'sanction',
    'город': 'city',
    'ход': 'move',
    'специальный военный': 'special military',
    'тема': 'topic',
    'уефа': 'uefa',
    'государство': 'state',
    'мир': 'peace',
    'нато': 'nato',
    'добавить': 'add',
    'запад': 'west',
    'война': 'war',
    'готовый': 'ready',
    'народный республика': "people's republic",
    'действие': 'action',
    'военнослужащий': 'serviceman',
    'делать': 'do',
    'эксперт': 'expert',
    'отношение': 'attitude',
    'считать': 'count',
    'рф': 'rf',
    'rt': 'rt',
    'позиция': 'position',
    'число': 'number',
    'гражданин': 'citizen',
    'власть': 'power',
    'цель': 'target',
    'встреча': 'meeting',
    'банк': 'bank',
    'заявление': 'statement',
    'напомнить': 'recall',
    'европа': 'europe',
    'владимир зеленский': 'vladimir zelensky',
    'условие': 'condition',
    'информация': 'information',
    'территория': 'territory',
    'безопасность': 'security',
    'всу': 'vsu',
    'отмечать': 'celebrate',
    'подразделение': 'unit',
    'политический': 'political',
    'новый': 'new',
    'мирный житель': 'civilian',
    'связь': 'connection',
    'международный': 'international',
    'евросоюз': 'european union',
    'днр лнр': 'dnr lnr',
    'россия владимир': 'russia vladimir',
    'минобороны россия': 'russia ministry of defense',
    'оружие': 'weapons',
    'ряд': 'row',
    'мнение': 'opinion',
    'боевой действие': 'combat action',
    'использовать': 'use',
    'возможность': 'opportunity',
    'организация': 'organization',
    'ситуация': 'situation',
    'мера': 'measure',
    'политика': 'policy',
    'стать': 'become',
    'республика': 'republic',
    'помощь': 'help',
    'самый': 'most',
    'случай': 'case',
    'сборная россия': 'national team russia',
    'конфликт': 'conflict',
    'никакой': 'none',
    'получить': 'get',
    'являться': 'appear',
    'рассказать': 'tell',
    'фифа': 'fifa',
    'тысяча': 'thousand',
    'официальный представитель': 'official representative',
    'пояснить': 'explain',
    'написать': 'write',
    'западный': 'western',
    'принять решение': 'make a decision',
    'донецкий народный': "Donetsk people's",
    'стыковой матч': 'clash match',
    'угроза': 'threat',
    'дело': 'case',
    'любой': 'any',
    'находиться': 'to be in',
    'президент украина': 'president of ukraine',
    'спецоперация украина': 'special operation ukraine',
    'команда': 'command',
    'сообщать': 'report',
    'март': 'March',
    'лицо': 'person',
    'сила': 'force',
    'призвать': 'call',
    'польша': 'Poland',
    'член': 'member',
    'российский лидер': 'Russian leader',
    'украинский военный': 'Ukrainian military',
    'американский': 'American',
    'право': 'right',
    'погибнуть': 'die',
    'финансовый': 'financial',
    'проведение специальный': 'holding special',
    'принять': 'take',
    'вооружение': 'armament',
    'вместе': 'together',
    'пытаться': 'try',
    'ранее сообщать': 'previously report',
    'украинский националист': 'Ukrainian nationalist',
    'начало': 'start',
    'житель': 'resident',
    'цитировать': 'quote',
    'провести': 'spend',
    'данные': 'data',
    'повод': 'occasion',
    'нужный': 'necessary',
    'участие': 'participation',
    'клуб': 'club',
    'часть': 'part',
    'говорить сообщение': 'speak message',
    'свой очередь': 'their turn',
    'военный': 'military',
    'националист': 'nationalist',
    'население': 'population',
    'представитель': 'representative',
    'украинский сторона': 'Ukrainian side',
    'населить пункт': 'populate the point',
    'официальный': 'official',
    'направить': 'send',
    'поддержка': 'support',
    'кремль': 'Kremlin',
    'происходить': 'take place',
    'германия': 'Germany',
    'российский военный': 'Russian military',
    'текст статья': 'text article',
    'глава': 'head',
    'войска': 'troops',
    'начать': 'start',
    'воевать': 'fight',
    'якобы': 'allegedly',
    'поэтому': 'therefore',
    'проблема': 'problem',
    'лавров': 'Lavrov',
    'оказаться': 'turn out',
    'возможный': 'possible',
    'задача': 'task',
    'вооружить сила': 'arm the force',
    'операция защита': 'operation defense',
    'защита донбасс': 'defense of Donbass',
    'народный милиция': "people's militia",
    'вооруженный сила': 'armed force',
    'глава мид': 'head of the mids',
    'швеция чехия': 'sweden czechia',
    'продолжать': 'continue',
    'россиянин': 'russian',
    'вести': 'vesti',
    'событие': 'event',
    'народ': 'people',
    'необходимый': 'necessary',
    'серьезный': 'serious',
    'российский сторона': 'russian side',
    'лига чемпион': 'league champion',
    'операция украина': 'operation ukraine',
    'план': 'plan',
    'частность': 'private',
    'нынешний': 'current',
    'интерес': 'interest',
    'рамка': 'frame',
    'состояться': 'take place',
    'поддержать': 'support',
    'мариуполь': 'mariupol',
    'чемпионат мир': 'world championship',
}

lemmatized_text = ' '.join(lemmatized_corpus)
russian_text = lemmatized_text
english_text = _translate_lemmas(russian_text, translation_dict)

wordcloud_russian = WordCloud(width=800, height=400, background_color='white').generate(russian_text)
wordcloud_english = WordCloud(width=800, height=400, background_color='white').generate(english_text)

_show_wordcloud(wordcloud_russian)
_show_wordcloud(wordcloud_english)

# --- Topic timeline ---------------------------------------------------------
data = pd.read_csv("/content/rt_upraveno.csv", delimiter=';')
data['Дата'] = pd.to_datetime(data['Date'], format='%d.%m.%Y')

filtered_data, data_grouped = _plot_topic_timeline(data)