के साथ pycurl का उपयोग करते समय पढ़ने के लिए धन्यवाद।ERROR "अतिरिक्त डेटा: लाइन 2 कॉलम 1" gzip stream
पृष्ठभूमि: मैं एक स्ट्रीमिंग API फ़ीड कि JSON
प्रारूप में डेटा देता है पढ़ने के लिए कोशिश कर रहा हूँ, और फिर एक pymongo collection
को यह डेटा भंडारण। स्ट्रीमिंग एपीआई को "Accept-Encoding" : "Gzip"
हेडर की आवश्यकता होती है।
क्या हो रहा है: - Extra data: line 2 column 1 - line 4 column 1 (char 1891 - 5597)
(देखें त्रुटि लॉग नीचे)
हर JSON ऑब्जेक्ट पार्स करते समय ऐसा नहीं होता है - कोड पर json.loads
और आउटपुट में विफल रहता है यह यादृच्छिक पर होता है।
मेरा अनुमान है कि मुझे हर "एक्स" उचित JSON ऑब्जेक्ट्स के बाद कुछ अजीब JSON ऑब्जेक्ट का सामना करना पड़ रहा है।
मैंने how to use pycurl if requested data is sometimes gzipped, sometimes not? और Encoding error while deserializing a json object from Google संदर्भित किया लेकिन अब तक इस त्रुटि को हल करने में असफल रहा है।
क्या कोई मेरी मदद कर सकता है?
त्रुटि लॉग: नोट: JSON ऑब्जेक्ट नीचे मूल रूप से repr()
विधि है कि CRLF/वामो (रों) हल किए बिना स्ट्रिंग के कच्चे प्रतिनिधित्व प्रिंट उपयोग कर रहा है के कच्चे डंप।
'{"id":"tag:search.twitter.com,2005:207958320747782146","objectType":"activity","actor":{"objectType":"person","id":"id:twitter.com:493653150","link":"http://www.twitter.com/Deathnews_7_24","displayName":"Death News 7/24","postedTime":"2012-02-16T01:30:12.000Z","image":"http://a0.twimg.com/profile_images/1834408513/deathnewstwittersquare_normal.jpg","summary":"Crashes, Murders, Suicides, Accidents, Crime and Naturals Death News From All Around World","links":[{"href":"http://www.facebook.com/DeathNews724","rel":"me"}],"friendsCount":56,"followersCount":14,"listedCount":1,"statusesCount":1029,"twitterTimeZone":null,"utcOffset":null,"preferredUsername":"Deathnews_7_24","languages":["tr"]},"verb":"post","postedTime":"2012-05-30T22:15:02.000Z","generator":{"displayName":"web","link":"http://twitter.com"},"provider":{"objectType":"service","displayName":"Twitter","link":"http://www.twitter.com"},"link":"http://twitter.com/Deathnews_7_24/statuses/207958320747782146","body":"Kathi Kamen Goldmark, Writers\xe2\x80\x99 Catalyst, Dies at 63 http://t.co/WBsNlNtA","object":{"objectType":"note","id":"object:search.twitter.com,2005:207958320747782146","summary":"Kathi Kamen Goldmark, Writers\xe2\x80\x99 Catalyst, Dies at 63 http://t.co/WBsNlNtA","link":"http://twitter.com/Deathnews_7_24/statuses/207958320747782146","postedTime":"2012-05-30T22:15:02.000Z"},"twitter_entities":{"urls":[{"display_url":"nytimes.com/2012/05/30/boo\xe2\x80\xa6","indices":[52,72],"expanded_url":"http://www.nytimes.com/2012/05/30/books/kathi-kamen-goldmark-writers-catalyst-dies-at-63.html","url":"http://t.co/WBsNlNtA"}],"hashtags":[],"user_mentions":[]},"gnip":{"language":{"value":"en"},"matching_rules":[{"value":"url_contains: nytimes.com","tag":null}],"klout_score":11,"urls":[{"url":"http://t.co/WBsNlNtA","expanded_url":"http://www.nytimes.com/2012/05/30/books/kathi-kamen-goldmark-writers-catalyst-dies-at-63.html?_r=1"}]}}\r\n{"id":"tag:search.twitter.com,2005:03638785","objectType":"activity","actor":{"objectType":"person","id":"id:twitter.com:178760897","link":"http://www.twitter.com/Mobanu","displayName":"Donald Ochs","postedTime":"2010-08-15T16:33:56.000Z","image":"http://a0.twimg.com/profile_images/1493224811/small_mobany_Logo_normal.jpg","summary":"","links":[{"href":"http://www.mobanuweightloss.com","rel":"me"}],"friendsCount":10272,"followersCount":9698,"listedCount":30,"statusesCount":725,"twitterTimeZone":"Mountain Time (US & Canada)","utcOffset":"-25200","preferredUsername":"Mobanu","languages":["en"],"location":{"objectType":"place","displayName":"Crested Butte, Colorado"}},"verb":"post","postedTime":"2012-05-30T22:15:02.000Z","generator":{"displayName":"twitterfeed","link":"http://twitterfeed.com"},"provider":{"objectType":"service","displayName":"Twitter","link":"http://www.twitter.com"},"link":"http://twitter.com/Mobanu/statuses/03638785","body":"Mobanu: Can Exercise Be Bad for You?: Researchers have found evidence that some people who exercise do worse on ... http://t.co/mTsQlNQO","object":{"objectType":"note","id":"object:search.twitter.com,2005:03638785","summary":"Mobanu: Can Exercise Be Bad for You?: Researchers have found evidence that some people who exercise do worse on ... http://t.co/mTsQlNQO","link":"http://twitter.com/Mobanu/statuses/03638785","postedTime":"2012-05-30T22:15:02.000Z"},"twitter_entities":{"urls":[{"display_url":"nyti.ms/KUmmMa","indices":[116,136],"expanded_url":"http://nyti.ms/KUmmMa","url":"http://t.co/mTsQlNQO"}],"hashtags":[],"user_mentions":[]},"gnip":{"language":{"value":"en"},"matching_rules":[{"value":"url_contains: nytimes.com","tag":null}],"klout_score":12,"urls":[{"url":"http://t.co/mTsQlNQO","expanded_url":"http://well.blogs.nytimes.com/2012/05/30/can-exercise-be-bad-for-you/?utm_medium=twitter&utm_source=twitterfeed"}]}}\r\n'
json exception: Extra data: line 2 column 1 - line 4 column 1 (char 1891 - 5597)
हैडर आउटपुट:
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Vary: Accept-Encoding
Date: Wed, 30 May 2012 22:14:48 UTC
Connection: close
Transfer-Encoding: chunked
Content-Encoding: gzip
get_stream.py:
#!/usr/bin/env python
import sys
import pycurl
import json
import pymongo
STREAM_URL = "https://stream.test.com:443/accounts/publishers/twitter/streams/track/Dev.json"
AUTH = "userid:passwd"
DB_HOST = "127.0.0.1"
DB_NAME = "stream_test"
class StreamReader:
def __init__(self):
try:
self.count = 0
self.buff = ""
self.mongo = pymongo.Connection(DB_HOST)
self.db = self.mongo[DB_NAME]
self.raw_tweets = self.db["raw_tweets_gnip"]
self.conn = pycurl.Curl()
self.conn.setopt(pycurl.ENCODING, 'gzip')
self.conn.setopt(pycurl.URL, STREAM_URL)
self.conn.setopt(pycurl.USERPWD, AUTH)
self.conn.setopt(pycurl.WRITEFUNCTION, self.on_receive)
self.conn.setopt(pycurl.HEADERFUNCTION, self.header_rcvd)
while True:
self.conn.perform()
except Exception as ex:
print "error ocurred : %s" % str(ex)
def header_rcvd(self, header_data):
print header_data
def on_receive(self, data):
temp_data = data
self.buff += data
if data.endswith("\r\n") and self.buff.strip():
try:
tweet = json.loads(self.buff, encoding = 'UTF-8')
self.buff = ""
if tweet:
try:
self.raw_tweets.insert(tweet)
except Exception as insert_ex:
print "Error inserting tweet: %s" % str(insert_ex)
self.count += 1
if self.count % 10 == 0:
print "inserted "+str(self.count)+" tweets"
except Exception as json_ex:
print "json exception: %s" % str(json_ex)
print repr(temp_data)
stream = StreamReader()
फिक्स्ड कोड:
def on_receive(self, data):
self.buff += data
if data.endswith("\r\n") and self.buff.strip():
# NEW: Split the buff at \r\n to get a list of JSON objects and iterate over them
json_obj = self.buff.split("\r\n")
for obj in json_obj:
if len(obj.strip()) > 0:
try:
tweet = json.loads(obj, encoding = 'UTF-8')
except Exception as json_ex:
print "JSON Exception occurred: %s" % str(json_ex)
continue
धन्यवाद !!! मुझे तुम्हारा एक पेय है, तुमने मेरा तनाव हल किया! – vgoklani