Commit de9e57d

scrapy
1 parent fb751aa commit de9e57d

15 files changed: +335 -0 lines changed

scrapy/save-categories-in-separated-files/category/__init__.py

Whitespace-only changes.

scrapy/save-categories-in-separated-files/category/items.py

Lines changed: 14 additions & 0 deletions

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class CategoryItem(scrapy.Item):
    Title = scrapy.Field()
    Date = scrapy.Field()
    # extra field used as filename
    Category = scrapy.Field()
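
Only `Title` and `Date` end up as CSV columns; `Category` is carried along just so the pipeline can build the output filename and is not written into the row itself.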

scrapy/save-categories-in-separated-files/category/middlewares.py

Lines changed: 56 additions & 0 deletions

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class CategorySpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

scrapy/save-categories-in-separated-files/category/pipelines.py

Lines changed: 29 additions & 0 deletions

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import csv

class CategoryPipeline(object):

    def process_item(self, item, spider):

        # get category and use it as filename
        filename = item['Category'] + '.csv'

        # open file for appending
        with open(filename, 'a') as f:
            writer = csv.writer(f)

            # write only selected elements
            row = [item['Title'], item['Date']]
            writer.writerow(row)

            # write all data in one row
            # warning: item works like a dictionary, so item.values() may not always return the values in the same order
            #writer.writerow(item.values())

        return item
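
As the warning above notes, `item.values()` gives no guaranteed column order. A minimal alternative sketch (hypothetical, not part of this commit; the class name is made up) that keeps the "write all data" idea but fixes the column order with `csv.DictWriter`:

```python
import csv

# Hypothetical variant (not part of this commit): write every field, but with an
# explicit column order instead of relying on item.values().
class CategoryDictWriterPipeline(object):

    fieldnames = ['Title', 'Date', 'Category']

    def process_item(self, item, spider):
        filename = item['Category'] + '.csv'

        with open(filename, 'a') as f:
            # extrasaction='ignore' skips any keys not listed in fieldnames
            writer = csv.DictWriter(f, fieldnames=self.fieldnames, extrasaction='ignore')
            # dict(item) works for plain dicts as well as scrapy.Item instances
            writer.writerow(dict(item))

        return item
```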

scrapy/save-categories-in-separated-files/category/settings.py

Lines changed: 90 additions & 0 deletions

# -*- coding: utf-8 -*-

# Scrapy settings for category project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'category'

SPIDER_MODULES = ['category.spiders']
NEWSPIDER_MODULE = 'category.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'category (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'category.middlewares.CategorySpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'category.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'category.pipelines.CategoryPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
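
Apart from the values generated by `scrapy startproject`, the only setting changed here is `ITEM_PIPELINES`, which registers `category.pipelines.CategoryPipeline` (priority 300) so that every scraped item is passed through it.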

scrapy/save-categories-in-separated-files/category/spiders/__init__.py

Lines changed: 4 additions & 0 deletions

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

scrapy/save-categories-in-separated-files/category/spiders/example.py

Lines changed: 40 additions & 0 deletions

# -*- coding: utf-8 -*-
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    allowed_domains = ['blog.furas.pl']
    start_urls = ['http://blog.furas.pl/category/python.html','http://blog.furas.pl/category/html.html','http://blog.furas.pl/category/linux.html']

    def parse(self, response):

        # get category from url
        category = response.url.split('/')[-1][:-5]

        urls = response.css('article a::attr(href)').extract()  # links to the subpages

        for url in urls:
            # skip some urls
            if ('/tag/' not in url) and ('/category/' not in url):
                url = response.urljoin(url)
                # add category (as meta) to send it to callback function
                yield scrapy.Request(url=url, callback=self.parse_details, meta={'category': category})

    def parse_details(self, response):

        # get category
        category = response.meta['category']

        # get only first title (or empty string '') and strip it
        title = response.css('h1.entry-title a::text').extract_first('')
        title = title.strip()

        # get only first date (or empty string '') and strip it
        date = response.css('.published::text').extract_first('')
        date = date.strip()

        yield {
            'Title': title,
            'Date': date,
            'Category': category,
        }
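
With the usual project layout, the crawl is started with `scrapy crawl example` from the directory containing scrapy.cfg. Each yielded item then goes through `CategoryPipeline`, which appends it to a file named after its category, so the three start URLs should produce python.csv, html.csv and linux.csv. A rough equivalent for running it from a script (a sketch, not part of this commit):

```python
# Sketch (not part of this commit): run the 'example' spider from a Python script
# instead of the "scrapy crawl example" command; assumes it is executed from the
# project directory, next to scrapy.cfg.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('example')   # spider name defined by ExampleSpider.name
process.start()            # blocks until the crawl is finished
```
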
Lines changed: 4 additions & 0 deletions

Jak uzyskać polskie znaki w Google Web Fonts,2012.09.05 środa
Jak zrobić authorship dla wszystkich stron,2012.09.17 poniedziałek
"Chowanie napisu ""NIEAKTUALNE"" z oferty otodom.pl",2012.10.14 niedziela
"Status Skype'a w postaci ikony, tekstu lub liczby",2012.10.14 niedziela
Lines changed: 9 additions & 0 deletions

Przycinanie nazwy pliku i katalogu w Bash,2012.09.17 poniedziałek
Przestrzeń wolna (free) a dostępna (available) na dysku pod Linux,2015.07.24 piątek
Sprawdzanie pod Linux rodzaju pamięci RAM,2015.07.27 poniedziałek
Wyszukiwanie w Linux pakietu zawierającego brakujący plik dla Pythona,2015.08.05 środa
Ściąga z komend Unix'a,2015.08.08 sobota
WiFi - standardowe narzędzie,2011.02.25 piątek
Uruchamianie windowsowej gry firmy Artifex Mundi pod Linux 64bit,2015.08.25 wtorek
Nauka Linuksa za darmo na LinuxJourney.com,2016.06.08 środa
Instalacja programu .msi w linuxie,2012.08.12 niedziela
Lines changed: 20 additions & 0 deletions

Shedulers - wykonywanie zadań w odstępie czasu,2016.07.01 piątek
"Książka ""Python For Everyone"" - zabójcza cena.",2016.07.16 sobota
Bitly_API on Python 3,2017.01.01 niedziela
Newslettery na temat Pythona,2016.06.24 piątek
Nieoficjalne repozytorium wersji binarnych Pythona dla Ubuntu i Linux Mint,2016.01.30 sobota
Open Browser czyli przeglądarka Web w Pythonie,2016.05.21 sobota
"12 Portali, które uczą programowania za darmo (według Fortune)",2016.05.22 niedziela
http-prompt - interaktywna linia komend HTTP w Pythonie,2016.05.22 niedziela
httpie - narzędzie w stylu cURL,2016.05.22 niedziela
Python Prompt Toolkit,2016.05.22 niedziela
urlparse przykład,2016.05.22 niedziela
PyCon 2016 nagrania video na YouTube,2016.06.01 środa
Listy darmowych ebooków na temat Pythona,2016.05.27 piątek
Dzień Informatyka 2016,2016.06.06 poniedziałek
Różności - Python - 2016.06.12,2016.06.12 niedziela
Wyniki ankiety PyCharm,2016.06.14 wtorek
"Promocja 45% ""Złota 10-tka"" w helion.pl",2016.06.14 wtorek
Książki w przygotowaniu,2016.06.24 piątek
Promocja 45% na ebooki w ebookpoint.pl,2016.06.15 środa
Helion promocja Wakacje [2016],2016.06.24 piątek

scrapy/save-categories-in-separated-files/scrapy.cfg

Lines changed: 11 additions & 0 deletions

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = category.settings

[deploy]
#url = http://localhost:6800/
project = category

tkinter/grid-columnconfigure-rowconfigure/README.md

Lines changed: 34 additions & 0 deletions

![#1](images/main-10.png?raw=true)

---

```python
import tkinter as tk

root = tk.Tk()

root.rowconfigure(1, weight=1)
root.rowconfigure(2, weight=4)
root.rowconfigure(3, weight=8)

b0 = tk.Button(root, text="weight=0")
b0.grid(column=0, row=0, sticky='ns')

b1 = tk.Button(root, text="weight=1")
b1.grid(column=0, row=1, sticky='ns')

b2 = tk.Button(root, text="weight=4")
b2.grid(column=0, row=2, sticky='ns')

b3 = tk.Button(root, text="weight=8")
b3.grid(column=0, row=3, sticky='ns')

root.mainloop()
```

| at start |
| --- |
| ![#1](images/main-11.png?raw=true) |

| resized (using mouse) |
| --- |
| ![#1](images/main-12.png?raw=true) |

tkinter/themes/main-1.py

Lines changed: 24 additions & 0 deletions

#!/usr/bin/env python3

import tkinter as tk
import tkinter.ttk as ttk

def red_buttons():
    s.configure('TButton', background='red')
    root.after(200, green_buttons)

def green_buttons():
    s.configure('TButton', background='green')
    root.after(200, red_buttons)

root = tk.Tk()

s = ttk.Style()
s.configure('TButton', background='pink')  #, foreground='red')

ttk.Button(root, text="1").pack()
ttk.Button(root, text="2").pack()

root.after(2000, red_buttons)

root.mainloop()
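
Whether the `background` set through `Style.configure('TButton', ...)` is actually visible depends on the active ttk theme; native platform themes may ignore it. A minimal sketch (an addition, not part of the commit) that forces the built-in 'clam' theme, which does honour the option:

```python
#!/usr/bin/env python3

# Sketch (not part of the commit): switch to the 'clam' theme so that the
# background configured on 'TButton' is honoured regardless of the platform.
import tkinter as tk
import tkinter.ttk as ttk

root = tk.Tk()

s = ttk.Style()
s.theme_use('clam')
s.configure('TButton', background='pink')

ttk.Button(root, text="themed").pack()

root.mainloop()
```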
